diff options
Diffstat (limited to 'kernel')
106 files changed, 8473 insertions, 6985 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 057472fbc272..0b72d1a74be0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o | |||
70 | obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o | 70 | obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o |
71 | obj-$(CONFIG_SMP) += stop_machine.o | 71 | obj-$(CONFIG_SMP) += stop_machine.o |
72 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 72 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
73 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o | 73 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
74 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 74 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
75 | obj-$(CONFIG_GCOV_KERNEL) += gcov/ | 75 | obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o |
76 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 76 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
77 | obj-$(CONFIG_GCOV_KERNEL) += gcov/ | ||
77 | obj-$(CONFIG_KPROBES) += kprobes.o | 78 | obj-$(CONFIG_KPROBES) += kprobes.o |
78 | obj-$(CONFIG_KGDB) += debug/ | 79 | obj-$(CONFIG_KGDB) += debug/ |
79 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | ||
80 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 80 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
81 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | ||
81 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 82 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
82 | obj-$(CONFIG_SECCOMP) += seccomp.o | 83 | obj-$(CONFIG_SECCOMP) += seccomp.o |
83 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 84 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
@@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/ | |||
99 | obj-$(CONFIG_X86_DS) += trace/ | 100 | obj-$(CONFIG_X86_DS) += trace/ |
100 | obj-$(CONFIG_RING_BUFFER) += trace/ | 101 | obj-$(CONFIG_RING_BUFFER) += trace/ |
101 | obj-$(CONFIG_SMP) += sched_cpupri.o | 102 | obj-$(CONFIG_SMP) += sched_cpupri.o |
102 | obj-$(CONFIG_SLOW_WORK) += slow-work.o | ||
103 | obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o | ||
104 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 103 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
105 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 104 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
106 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 105 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
diff --git a/kernel/acct.c b/kernel/acct.c index 385b88461c29..fa7eb3de2ddc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
122 | spin_unlock(&acct_lock); | 122 | spin_unlock(&acct_lock); |
123 | 123 | ||
124 | /* May block */ | 124 | /* May block */ |
125 | if (vfs_statfs(file->f_path.dentry, &sbuf)) | 125 | if (vfs_statfs(&file->f_path, &sbuf)) |
126 | return res; | 126 | return res; |
127 | suspend = sbuf.f_blocks * SUSPEND; | 127 | suspend = sbuf.f_blocks * SUSPEND; |
128 | resume = sbuf.f_blocks * RESUME; | 128 | resume = sbuf.f_blocks * RESUME; |
diff --git a/kernel/async.c b/kernel/async.c index 15319d6c18fe..cd9dbb913c77 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel. | |||
49 | */ | 49 | */ |
50 | 50 | ||
51 | #include <linux/async.h> | 51 | #include <linux/async.h> |
52 | #include <linux/bug.h> | ||
53 | #include <linux/module.h> | 52 | #include <linux/module.h> |
54 | #include <linux/wait.h> | 53 | #include <linux/wait.h> |
55 | #include <linux/sched.h> | 54 | #include <linux/sched.h> |
56 | #include <linux/init.h> | ||
57 | #include <linux/kthread.h> | ||
58 | #include <linux/delay.h> | ||
59 | #include <linux/slab.h> | 55 | #include <linux/slab.h> |
56 | #include <linux/workqueue.h> | ||
60 | #include <asm/atomic.h> | 57 | #include <asm/atomic.h> |
61 | 58 | ||
62 | static async_cookie_t next_cookie = 1; | 59 | static async_cookie_t next_cookie = 1; |
63 | 60 | ||
64 | #define MAX_THREADS 256 | ||
65 | #define MAX_WORK 32768 | 61 | #define MAX_WORK 32768 |
66 | 62 | ||
67 | static LIST_HEAD(async_pending); | 63 | static LIST_HEAD(async_pending); |
68 | static LIST_HEAD(async_running); | 64 | static LIST_HEAD(async_running); |
69 | static DEFINE_SPINLOCK(async_lock); | 65 | static DEFINE_SPINLOCK(async_lock); |
70 | 66 | ||
71 | static int async_enabled = 0; | ||
72 | |||
73 | struct async_entry { | 67 | struct async_entry { |
74 | struct list_head list; | 68 | struct list_head list; |
75 | async_cookie_t cookie; | 69 | struct work_struct work; |
76 | async_func_ptr *func; | 70 | async_cookie_t cookie; |
77 | void *data; | 71 | async_func_ptr *func; |
78 | struct list_head *running; | 72 | void *data; |
73 | struct list_head *running; | ||
79 | }; | 74 | }; |
80 | 75 | ||
81 | static DECLARE_WAIT_QUEUE_HEAD(async_done); | 76 | static DECLARE_WAIT_QUEUE_HEAD(async_done); |
82 | static DECLARE_WAIT_QUEUE_HEAD(async_new); | ||
83 | 77 | ||
84 | static atomic_t entry_count; | 78 | static atomic_t entry_count; |
85 | static atomic_t thread_count; | ||
86 | 79 | ||
87 | extern int initcall_debug; | 80 | extern int initcall_debug; |
88 | 81 | ||
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) | |||
117 | spin_unlock_irqrestore(&async_lock, flags); | 110 | spin_unlock_irqrestore(&async_lock, flags); |
118 | return ret; | 111 | return ret; |
119 | } | 112 | } |
113 | |||
120 | /* | 114 | /* |
121 | * pick the first pending entry and run it | 115 | * pick the first pending entry and run it |
122 | */ | 116 | */ |
123 | static void run_one_entry(void) | 117 | static void async_run_entry_fn(struct work_struct *work) |
124 | { | 118 | { |
119 | struct async_entry *entry = | ||
120 | container_of(work, struct async_entry, work); | ||
125 | unsigned long flags; | 121 | unsigned long flags; |
126 | struct async_entry *entry; | ||
127 | ktime_t calltime, delta, rettime; | 122 | ktime_t calltime, delta, rettime; |
128 | 123 | ||
129 | /* 1) pick one task from the pending queue */ | 124 | /* 1) move self to the running queue */ |
130 | |||
131 | spin_lock_irqsave(&async_lock, flags); | 125 | spin_lock_irqsave(&async_lock, flags); |
132 | if (list_empty(&async_pending)) | ||
133 | goto out; | ||
134 | entry = list_first_entry(&async_pending, struct async_entry, list); | ||
135 | |||
136 | /* 2) move it to the running queue */ | ||
137 | list_move_tail(&entry->list, entry->running); | 126 | list_move_tail(&entry->list, entry->running); |
138 | spin_unlock_irqrestore(&async_lock, flags); | 127 | spin_unlock_irqrestore(&async_lock, flags); |
139 | 128 | ||
140 | /* 3) run it (and print duration)*/ | 129 | /* 2) run (and print duration) */ |
141 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 130 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
142 | printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, | 131 | printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, |
143 | entry->func, task_pid_nr(current)); | 132 | entry->func, task_pid_nr(current)); |
@@ -153,31 +142,25 @@ static void run_one_entry(void) | |||
153 | (long long)ktime_to_ns(delta) >> 10); | 142 | (long long)ktime_to_ns(delta) >> 10); |
154 | } | 143 | } |
155 | 144 | ||
156 | /* 4) remove it from the running queue */ | 145 | /* 3) remove self from the running queue */ |
157 | spin_lock_irqsave(&async_lock, flags); | 146 | spin_lock_irqsave(&async_lock, flags); |
158 | list_del(&entry->list); | 147 | list_del(&entry->list); |
159 | 148 | ||
160 | /* 5) free the entry */ | 149 | /* 4) free the entry */ |
161 | kfree(entry); | 150 | kfree(entry); |
162 | atomic_dec(&entry_count); | 151 | atomic_dec(&entry_count); |
163 | 152 | ||
164 | spin_unlock_irqrestore(&async_lock, flags); | 153 | spin_unlock_irqrestore(&async_lock, flags); |
165 | 154 | ||
166 | /* 6) wake up any waiters. */ | 155 | /* 5) wake up any waiters */ |
167 | wake_up(&async_done); | 156 | wake_up(&async_done); |
168 | return; | ||
169 | |||
170 | out: | ||
171 | spin_unlock_irqrestore(&async_lock, flags); | ||
172 | } | 157 | } |
173 | 158 | ||
174 | |||
175 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) | 159 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) |
176 | { | 160 | { |
177 | struct async_entry *entry; | 161 | struct async_entry *entry; |
178 | unsigned long flags; | 162 | unsigned long flags; |
179 | async_cookie_t newcookie; | 163 | async_cookie_t newcookie; |
180 | |||
181 | 164 | ||
182 | /* allow irq-off callers */ | 165 | /* allow irq-off callers */ |
183 | entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); | 166 | entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); |
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
186 | * If we're out of memory or if there's too much work | 169 | * If we're out of memory or if there's too much work |
187 | * pending already, we execute synchronously. | 170 | * pending already, we execute synchronously. |
188 | */ | 171 | */ |
189 | if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { | 172 | if (!entry || atomic_read(&entry_count) > MAX_WORK) { |
190 | kfree(entry); | 173 | kfree(entry); |
191 | spin_lock_irqsave(&async_lock, flags); | 174 | spin_lock_irqsave(&async_lock, flags); |
192 | newcookie = next_cookie++; | 175 | newcookie = next_cookie++; |
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
196 | ptr(data, newcookie); | 179 | ptr(data, newcookie); |
197 | return newcookie; | 180 | return newcookie; |
198 | } | 181 | } |
182 | INIT_WORK(&entry->work, async_run_entry_fn); | ||
199 | entry->func = ptr; | 183 | entry->func = ptr; |
200 | entry->data = data; | 184 | entry->data = data; |
201 | entry->running = running; | 185 | entry->running = running; |
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
205 | list_add_tail(&entry->list, &async_pending); | 189 | list_add_tail(&entry->list, &async_pending); |
206 | atomic_inc(&entry_count); | 190 | atomic_inc(&entry_count); |
207 | spin_unlock_irqrestore(&async_lock, flags); | 191 | spin_unlock_irqrestore(&async_lock, flags); |
208 | wake_up(&async_new); | 192 | |
193 | /* schedule for execution */ | ||
194 | queue_work(system_unbound_wq, &entry->work); | ||
195 | |||
209 | return newcookie; | 196 | return newcookie; |
210 | } | 197 | } |
211 | 198 | ||
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) | |||
312 | async_synchronize_cookie_domain(cookie, &async_running); | 299 | async_synchronize_cookie_domain(cookie, &async_running); |
313 | } | 300 | } |
314 | EXPORT_SYMBOL_GPL(async_synchronize_cookie); | 301 | EXPORT_SYMBOL_GPL(async_synchronize_cookie); |
315 | |||
316 | |||
317 | static int async_thread(void *unused) | ||
318 | { | ||
319 | DECLARE_WAITQUEUE(wq, current); | ||
320 | add_wait_queue(&async_new, &wq); | ||
321 | |||
322 | while (!kthread_should_stop()) { | ||
323 | int ret = HZ; | ||
324 | set_current_state(TASK_INTERRUPTIBLE); | ||
325 | /* | ||
326 | * check the list head without lock.. false positives | ||
327 | * are dealt with inside run_one_entry() while holding | ||
328 | * the lock. | ||
329 | */ | ||
330 | rmb(); | ||
331 | if (!list_empty(&async_pending)) | ||
332 | run_one_entry(); | ||
333 | else | ||
334 | ret = schedule_timeout(HZ); | ||
335 | |||
336 | if (ret == 0) { | ||
337 | /* | ||
338 | * we timed out, this means we as thread are redundant. | ||
339 | * we sign off and die, but we to avoid any races there | ||
340 | * is a last-straw check to see if work snuck in. | ||
341 | */ | ||
342 | atomic_dec(&thread_count); | ||
343 | wmb(); /* manager must see our departure first */ | ||
344 | if (list_empty(&async_pending)) | ||
345 | break; | ||
346 | /* | ||
347 | * woops work came in between us timing out and us | ||
348 | * signing off; we need to stay alive and keep working. | ||
349 | */ | ||
350 | atomic_inc(&thread_count); | ||
351 | } | ||
352 | } | ||
353 | remove_wait_queue(&async_new, &wq); | ||
354 | |||
355 | return 0; | ||
356 | } | ||
357 | |||
358 | static int async_manager_thread(void *unused) | ||
359 | { | ||
360 | DECLARE_WAITQUEUE(wq, current); | ||
361 | add_wait_queue(&async_new, &wq); | ||
362 | |||
363 | while (!kthread_should_stop()) { | ||
364 | int tc, ec; | ||
365 | |||
366 | set_current_state(TASK_INTERRUPTIBLE); | ||
367 | |||
368 | tc = atomic_read(&thread_count); | ||
369 | rmb(); | ||
370 | ec = atomic_read(&entry_count); | ||
371 | |||
372 | while (tc < ec && tc < MAX_THREADS) { | ||
373 | if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", | ||
374 | tc))) { | ||
375 | msleep(100); | ||
376 | continue; | ||
377 | } | ||
378 | atomic_inc(&thread_count); | ||
379 | tc++; | ||
380 | } | ||
381 | |||
382 | schedule(); | ||
383 | } | ||
384 | remove_wait_queue(&async_new, &wq); | ||
385 | |||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | static int __init async_init(void) | ||
390 | { | ||
391 | async_enabled = | ||
392 | !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); | ||
393 | |||
394 | WARN_ON(!async_enabled); | ||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | core_initcall(async_init); | ||
diff --git a/kernel/audit.c b/kernel/audit.c index 8296aa516c5a..d96045789b54 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -56,7 +56,6 @@ | |||
56 | #include <net/netlink.h> | 56 | #include <net/netlink.h> |
57 | #include <linux/skbuff.h> | 57 | #include <linux/skbuff.h> |
58 | #include <linux/netlink.h> | 58 | #include <linux/netlink.h> |
59 | #include <linux/inotify.h> | ||
60 | #include <linux/freezer.h> | 59 | #include <linux/freezer.h> |
61 | #include <linux/tty.h> | 60 | #include <linux/tty.h> |
62 | 61 | ||
diff --git a/kernel/audit.h b/kernel/audit.h index 208687be4f30..f7206db4e13d 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex; | |||
103 | extern void audit_free_rule_rcu(struct rcu_head *); | 103 | extern void audit_free_rule_rcu(struct rcu_head *); |
104 | extern struct list_head audit_filter_list[]; | 104 | extern struct list_head audit_filter_list[]; |
105 | 105 | ||
106 | extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); | ||
107 | |||
106 | /* audit watch functions */ | 108 | /* audit watch functions */ |
107 | extern unsigned long audit_watch_inode(struct audit_watch *watch); | 109 | #ifdef CONFIG_AUDIT_WATCH |
108 | extern dev_t audit_watch_dev(struct audit_watch *watch); | ||
109 | extern void audit_put_watch(struct audit_watch *watch); | 110 | extern void audit_put_watch(struct audit_watch *watch); |
110 | extern void audit_get_watch(struct audit_watch *watch); | 111 | extern void audit_get_watch(struct audit_watch *watch); |
111 | extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); | 112 | extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); |
112 | extern int audit_add_watch(struct audit_krule *krule); | 113 | extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); |
113 | extern void audit_remove_watch(struct audit_watch *watch); | 114 | extern void audit_remove_watch_rule(struct audit_krule *krule); |
114 | extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); | ||
115 | extern void audit_inotify_unregister(struct list_head *in_list); | ||
116 | extern char *audit_watch_path(struct audit_watch *watch); | 115 | extern char *audit_watch_path(struct audit_watch *watch); |
117 | extern struct list_head *audit_watch_rules(struct audit_watch *watch); | 116 | extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); |
118 | 117 | #else | |
119 | extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, | 118 | #define audit_put_watch(w) {} |
120 | struct audit_watch *watch); | 119 | #define audit_get_watch(w) {} |
120 | #define audit_to_watch(k, p, l, o) (-EINVAL) | ||
121 | #define audit_add_watch(k, l) (-EINVAL) | ||
122 | #define audit_remove_watch_rule(k) BUG() | ||
123 | #define audit_watch_path(w) "" | ||
124 | #define audit_watch_compare(w, i, d) 0 | ||
125 | |||
126 | #endif /* CONFIG_AUDIT_WATCH */ | ||
121 | 127 | ||
122 | #ifdef CONFIG_AUDIT_TREE | 128 | #ifdef CONFIG_AUDIT_TREE |
123 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); | 129 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 46a57b57a335..7f18d3a4527e 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c | |||
@@ -1,5 +1,5 @@ | |||
1 | #include "audit.h" | 1 | #include "audit.h" |
2 | #include <linux/inotify.h> | 2 | #include <linux/fsnotify_backend.h> |
3 | #include <linux/namei.h> | 3 | #include <linux/namei.h> |
4 | #include <linux/mount.h> | 4 | #include <linux/mount.h> |
5 | #include <linux/kthread.h> | 5 | #include <linux/kthread.h> |
@@ -22,7 +22,7 @@ struct audit_tree { | |||
22 | 22 | ||
23 | struct audit_chunk { | 23 | struct audit_chunk { |
24 | struct list_head hash; | 24 | struct list_head hash; |
25 | struct inotify_watch watch; | 25 | struct fsnotify_mark mark; |
26 | struct list_head trees; /* with root here */ | 26 | struct list_head trees; /* with root here */ |
27 | int dead; | 27 | int dead; |
28 | int count; | 28 | int count; |
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list); | |||
59 | * tree is refcounted; one reference for "some rules on rules_list refer to | 59 | * tree is refcounted; one reference for "some rules on rules_list refer to |
60 | * it", one for each chunk with pointer to it. | 60 | * it", one for each chunk with pointer to it. |
61 | * | 61 | * |
62 | * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount | 62 | * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount |
63 | * of watch contributes 1 to .refs). | 63 | * of watch contributes 1 to .refs). |
64 | * | 64 | * |
65 | * node.index allows to get from node.list to containing chunk. | 65 | * node.index allows to get from node.list to containing chunk. |
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list); | |||
68 | * that makes a difference. Some. | 68 | * that makes a difference. Some. |
69 | */ | 69 | */ |
70 | 70 | ||
71 | static struct inotify_handle *rtree_ih; | 71 | static struct fsnotify_group *audit_tree_group; |
72 | 72 | ||
73 | static struct audit_tree *alloc_tree(const char *s) | 73 | static struct audit_tree *alloc_tree(const char *s) |
74 | { | 74 | { |
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree) | |||
111 | return tree->pathname; | 111 | return tree->pathname; |
112 | } | 112 | } |
113 | 113 | ||
114 | static struct audit_chunk *alloc_chunk(int count) | ||
115 | { | ||
116 | struct audit_chunk *chunk; | ||
117 | size_t size; | ||
118 | int i; | ||
119 | |||
120 | size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); | ||
121 | chunk = kzalloc(size, GFP_KERNEL); | ||
122 | if (!chunk) | ||
123 | return NULL; | ||
124 | |||
125 | INIT_LIST_HEAD(&chunk->hash); | ||
126 | INIT_LIST_HEAD(&chunk->trees); | ||
127 | chunk->count = count; | ||
128 | atomic_long_set(&chunk->refs, 1); | ||
129 | for (i = 0; i < count; i++) { | ||
130 | INIT_LIST_HEAD(&chunk->owners[i].list); | ||
131 | chunk->owners[i].index = i; | ||
132 | } | ||
133 | inotify_init_watch(&chunk->watch); | ||
134 | return chunk; | ||
135 | } | ||
136 | |||
137 | static void free_chunk(struct audit_chunk *chunk) | 114 | static void free_chunk(struct audit_chunk *chunk) |
138 | { | 115 | { |
139 | int i; | 116 | int i; |
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu) | |||
157 | audit_put_chunk(chunk); | 134 | audit_put_chunk(chunk); |
158 | } | 135 | } |
159 | 136 | ||
137 | static void audit_tree_destroy_watch(struct fsnotify_mark *entry) | ||
138 | { | ||
139 | struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); | ||
140 | call_rcu(&chunk->head, __put_chunk); | ||
141 | } | ||
142 | |||
143 | static struct audit_chunk *alloc_chunk(int count) | ||
144 | { | ||
145 | struct audit_chunk *chunk; | ||
146 | size_t size; | ||
147 | int i; | ||
148 | |||
149 | size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); | ||
150 | chunk = kzalloc(size, GFP_KERNEL); | ||
151 | if (!chunk) | ||
152 | return NULL; | ||
153 | |||
154 | INIT_LIST_HEAD(&chunk->hash); | ||
155 | INIT_LIST_HEAD(&chunk->trees); | ||
156 | chunk->count = count; | ||
157 | atomic_long_set(&chunk->refs, 1); | ||
158 | for (i = 0; i < count; i++) { | ||
159 | INIT_LIST_HEAD(&chunk->owners[i].list); | ||
160 | chunk->owners[i].index = i; | ||
161 | } | ||
162 | fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); | ||
163 | return chunk; | ||
164 | } | ||
165 | |||
160 | enum {HASH_SIZE = 128}; | 166 | enum {HASH_SIZE = 128}; |
161 | static struct list_head chunk_hash_heads[HASH_SIZE]; | 167 | static struct list_head chunk_hash_heads[HASH_SIZE]; |
162 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); | 168 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); |
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode) | |||
167 | return chunk_hash_heads + n % HASH_SIZE; | 173 | return chunk_hash_heads + n % HASH_SIZE; |
168 | } | 174 | } |
169 | 175 | ||
170 | /* hash_lock is held by caller */ | 176 | /* hash_lock & entry->lock is held by caller */ |
171 | static void insert_hash(struct audit_chunk *chunk) | 177 | static void insert_hash(struct audit_chunk *chunk) |
172 | { | 178 | { |
173 | struct list_head *list = chunk_hash(chunk->watch.inode); | 179 | struct fsnotify_mark *entry = &chunk->mark; |
180 | struct list_head *list; | ||
181 | |||
182 | if (!entry->i.inode) | ||
183 | return; | ||
184 | list = chunk_hash(entry->i.inode); | ||
174 | list_add_rcu(&chunk->hash, list); | 185 | list_add_rcu(&chunk->hash, list); |
175 | } | 186 | } |
176 | 187 | ||
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) | |||
181 | struct audit_chunk *p; | 192 | struct audit_chunk *p; |
182 | 193 | ||
183 | list_for_each_entry_rcu(p, list, hash) { | 194 | list_for_each_entry_rcu(p, list, hash) { |
184 | if (p->watch.inode == inode) { | 195 | /* mark.inode may have gone NULL, but who cares? */ |
196 | if (p->mark.i.inode == inode) { | ||
185 | atomic_long_inc(&p->refs); | 197 | atomic_long_inc(&p->refs); |
186 | return p; | 198 | return p; |
187 | } | 199 | } |
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p) | |||
210 | static void untag_chunk(struct node *p) | 222 | static void untag_chunk(struct node *p) |
211 | { | 223 | { |
212 | struct audit_chunk *chunk = find_chunk(p); | 224 | struct audit_chunk *chunk = find_chunk(p); |
225 | struct fsnotify_mark *entry = &chunk->mark; | ||
213 | struct audit_chunk *new; | 226 | struct audit_chunk *new; |
214 | struct audit_tree *owner; | 227 | struct audit_tree *owner; |
215 | int size = chunk->count - 1; | 228 | int size = chunk->count - 1; |
216 | int i, j; | 229 | int i, j; |
217 | 230 | ||
218 | if (!pin_inotify_watch(&chunk->watch)) { | 231 | fsnotify_get_mark(entry); |
219 | /* | ||
220 | * Filesystem is shutting down; all watches are getting | ||
221 | * evicted, just take it off the node list for this | ||
222 | * tree and let the eviction logics take care of the | ||
223 | * rest. | ||
224 | */ | ||
225 | owner = p->owner; | ||
226 | if (owner->root == chunk) { | ||
227 | list_del_init(&owner->same_root); | ||
228 | owner->root = NULL; | ||
229 | } | ||
230 | list_del_init(&p->list); | ||
231 | p->owner = NULL; | ||
232 | put_tree(owner); | ||
233 | return; | ||
234 | } | ||
235 | 232 | ||
236 | spin_unlock(&hash_lock); | 233 | spin_unlock(&hash_lock); |
237 | 234 | ||
238 | /* | 235 | spin_lock(&entry->lock); |
239 | * pin_inotify_watch() succeeded, so the watch won't go away | 236 | if (chunk->dead || !entry->i.inode) { |
240 | * from under us. | 237 | spin_unlock(&entry->lock); |
241 | */ | ||
242 | mutex_lock(&chunk->watch.inode->inotify_mutex); | ||
243 | if (chunk->dead) { | ||
244 | mutex_unlock(&chunk->watch.inode->inotify_mutex); | ||
245 | goto out; | 238 | goto out; |
246 | } | 239 | } |
247 | 240 | ||
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p) | |||
256 | list_del_init(&p->list); | 249 | list_del_init(&p->list); |
257 | list_del_rcu(&chunk->hash); | 250 | list_del_rcu(&chunk->hash); |
258 | spin_unlock(&hash_lock); | 251 | spin_unlock(&hash_lock); |
259 | inotify_evict_watch(&chunk->watch); | 252 | spin_unlock(&entry->lock); |
260 | mutex_unlock(&chunk->watch.inode->inotify_mutex); | 253 | fsnotify_destroy_mark(entry); |
261 | put_inotify_watch(&chunk->watch); | 254 | fsnotify_put_mark(entry); |
262 | goto out; | 255 | goto out; |
263 | } | 256 | } |
264 | 257 | ||
265 | new = alloc_chunk(size); | 258 | new = alloc_chunk(size); |
266 | if (!new) | 259 | if (!new) |
267 | goto Fallback; | 260 | goto Fallback; |
268 | if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { | 261 | fsnotify_duplicate_mark(&new->mark, entry); |
262 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | ||
269 | free_chunk(new); | 263 | free_chunk(new); |
270 | goto Fallback; | 264 | goto Fallback; |
271 | } | 265 | } |
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p) | |||
298 | list_for_each_entry(owner, &new->trees, same_root) | 292 | list_for_each_entry(owner, &new->trees, same_root) |
299 | owner->root = new; | 293 | owner->root = new; |
300 | spin_unlock(&hash_lock); | 294 | spin_unlock(&hash_lock); |
301 | inotify_evict_watch(&chunk->watch); | 295 | spin_unlock(&entry->lock); |
302 | mutex_unlock(&chunk->watch.inode->inotify_mutex); | 296 | fsnotify_destroy_mark(entry); |
303 | put_inotify_watch(&chunk->watch); | 297 | fsnotify_put_mark(entry); |
304 | goto out; | 298 | goto out; |
305 | 299 | ||
306 | Fallback: | 300 | Fallback: |
@@ -314,31 +308,33 @@ Fallback: | |||
314 | p->owner = NULL; | 308 | p->owner = NULL; |
315 | put_tree(owner); | 309 | put_tree(owner); |
316 | spin_unlock(&hash_lock); | 310 | spin_unlock(&hash_lock); |
317 | mutex_unlock(&chunk->watch.inode->inotify_mutex); | 311 | spin_unlock(&entry->lock); |
318 | out: | 312 | out: |
319 | unpin_inotify_watch(&chunk->watch); | 313 | fsnotify_put_mark(entry); |
320 | spin_lock(&hash_lock); | 314 | spin_lock(&hash_lock); |
321 | } | 315 | } |
322 | 316 | ||
323 | static int create_chunk(struct inode *inode, struct audit_tree *tree) | 317 | static int create_chunk(struct inode *inode, struct audit_tree *tree) |
324 | { | 318 | { |
319 | struct fsnotify_mark *entry; | ||
325 | struct audit_chunk *chunk = alloc_chunk(1); | 320 | struct audit_chunk *chunk = alloc_chunk(1); |
326 | if (!chunk) | 321 | if (!chunk) |
327 | return -ENOMEM; | 322 | return -ENOMEM; |
328 | 323 | ||
329 | if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { | 324 | entry = &chunk->mark; |
325 | if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { | ||
330 | free_chunk(chunk); | 326 | free_chunk(chunk); |
331 | return -ENOSPC; | 327 | return -ENOSPC; |
332 | } | 328 | } |
333 | 329 | ||
334 | mutex_lock(&inode->inotify_mutex); | 330 | spin_lock(&entry->lock); |
335 | spin_lock(&hash_lock); | 331 | spin_lock(&hash_lock); |
336 | if (tree->goner) { | 332 | if (tree->goner) { |
337 | spin_unlock(&hash_lock); | 333 | spin_unlock(&hash_lock); |
338 | chunk->dead = 1; | 334 | chunk->dead = 1; |
339 | inotify_evict_watch(&chunk->watch); | 335 | spin_unlock(&entry->lock); |
340 | mutex_unlock(&inode->inotify_mutex); | 336 | fsnotify_destroy_mark(entry); |
341 | put_inotify_watch(&chunk->watch); | 337 | fsnotify_put_mark(entry); |
342 | return 0; | 338 | return 0; |
343 | } | 339 | } |
344 | chunk->owners[0].index = (1U << 31); | 340 | chunk->owners[0].index = (1U << 31); |
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) | |||
351 | } | 347 | } |
352 | insert_hash(chunk); | 348 | insert_hash(chunk); |
353 | spin_unlock(&hash_lock); | 349 | spin_unlock(&hash_lock); |
354 | mutex_unlock(&inode->inotify_mutex); | 350 | spin_unlock(&entry->lock); |
355 | return 0; | 351 | return 0; |
356 | } | 352 | } |
357 | 353 | ||
358 | /* the first tagged inode becomes root of tree */ | 354 | /* the first tagged inode becomes root of tree */ |
359 | static int tag_chunk(struct inode *inode, struct audit_tree *tree) | 355 | static int tag_chunk(struct inode *inode, struct audit_tree *tree) |
360 | { | 356 | { |
361 | struct inotify_watch *watch; | 357 | struct fsnotify_mark *old_entry, *chunk_entry; |
362 | struct audit_tree *owner; | 358 | struct audit_tree *owner; |
363 | struct audit_chunk *chunk, *old; | 359 | struct audit_chunk *chunk, *old; |
364 | struct node *p; | 360 | struct node *p; |
365 | int n; | 361 | int n; |
366 | 362 | ||
367 | if (inotify_find_watch(rtree_ih, inode, &watch) < 0) | 363 | old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); |
364 | if (!old_entry) | ||
368 | return create_chunk(inode, tree); | 365 | return create_chunk(inode, tree); |
369 | 366 | ||
370 | old = container_of(watch, struct audit_chunk, watch); | 367 | old = container_of(old_entry, struct audit_chunk, mark); |
371 | 368 | ||
372 | /* are we already there? */ | 369 | /* are we already there? */ |
373 | spin_lock(&hash_lock); | 370 | spin_lock(&hash_lock); |
374 | for (n = 0; n < old->count; n++) { | 371 | for (n = 0; n < old->count; n++) { |
375 | if (old->owners[n].owner == tree) { | 372 | if (old->owners[n].owner == tree) { |
376 | spin_unlock(&hash_lock); | 373 | spin_unlock(&hash_lock); |
377 | put_inotify_watch(&old->watch); | 374 | fsnotify_put_mark(old_entry); |
378 | return 0; | 375 | return 0; |
379 | } | 376 | } |
380 | } | 377 | } |
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
382 | 379 | ||
383 | chunk = alloc_chunk(old->count + 1); | 380 | chunk = alloc_chunk(old->count + 1); |
384 | if (!chunk) { | 381 | if (!chunk) { |
385 | put_inotify_watch(&old->watch); | 382 | fsnotify_put_mark(old_entry); |
386 | return -ENOMEM; | 383 | return -ENOMEM; |
387 | } | 384 | } |
388 | 385 | ||
389 | mutex_lock(&inode->inotify_mutex); | 386 | chunk_entry = &chunk->mark; |
390 | if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { | 387 | |
391 | mutex_unlock(&inode->inotify_mutex); | 388 | spin_lock(&old_entry->lock); |
392 | put_inotify_watch(&old->watch); | 389 | if (!old_entry->i.inode) { |
390 | /* old_entry is being shot, lets just lie */ | ||
391 | spin_unlock(&old_entry->lock); | ||
392 | fsnotify_put_mark(old_entry); | ||
393 | free_chunk(chunk); | 393 | free_chunk(chunk); |
394 | return -ENOENT; | ||
395 | } | ||
396 | |||
397 | fsnotify_duplicate_mark(chunk_entry, old_entry); | ||
398 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | ||
399 | spin_unlock(&old_entry->lock); | ||
400 | free_chunk(chunk); | ||
401 | fsnotify_put_mark(old_entry); | ||
394 | return -ENOSPC; | 402 | return -ENOSPC; |
395 | } | 403 | } |
404 | |||
405 | /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ | ||
406 | spin_lock(&chunk_entry->lock); | ||
396 | spin_lock(&hash_lock); | 407 | spin_lock(&hash_lock); |
408 | |||
409 | /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ | ||
397 | if (tree->goner) { | 410 | if (tree->goner) { |
398 | spin_unlock(&hash_lock); | 411 | spin_unlock(&hash_lock); |
399 | chunk->dead = 1; | 412 | chunk->dead = 1; |
400 | inotify_evict_watch(&chunk->watch); | 413 | spin_unlock(&chunk_entry->lock); |
401 | mutex_unlock(&inode->inotify_mutex); | 414 | spin_unlock(&old_entry->lock); |
402 | put_inotify_watch(&old->watch); | 415 | |
403 | put_inotify_watch(&chunk->watch); | 416 | fsnotify_destroy_mark(chunk_entry); |
417 | |||
418 | fsnotify_put_mark(chunk_entry); | ||
419 | fsnotify_put_mark(old_entry); | ||
404 | return 0; | 420 | return 0; |
405 | } | 421 | } |
406 | list_replace_init(&old->trees, &chunk->trees); | 422 | list_replace_init(&old->trees, &chunk->trees); |
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
426 | list_add(&tree->same_root, &chunk->trees); | 442 | list_add(&tree->same_root, &chunk->trees); |
427 | } | 443 | } |
428 | spin_unlock(&hash_lock); | 444 | spin_unlock(&hash_lock); |
429 | inotify_evict_watch(&old->watch); | 445 | spin_unlock(&chunk_entry->lock); |
430 | mutex_unlock(&inode->inotify_mutex); | 446 | spin_unlock(&old_entry->lock); |
431 | put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ | 447 | fsnotify_destroy_mark(old_entry); |
432 | put_inotify_watch(&old->watch); /* and kill it */ | 448 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ |
449 | fsnotify_put_mark(old_entry); /* and kill it */ | ||
433 | return 0; | 450 | return 0; |
434 | } | 451 | } |
435 | 452 | ||
@@ -584,7 +601,9 @@ void audit_trim_trees(void) | |||
584 | 601 | ||
585 | spin_lock(&hash_lock); | 602 | spin_lock(&hash_lock); |
586 | list_for_each_entry(node, &tree->chunks, list) { | 603 | list_for_each_entry(node, &tree->chunks, list) { |
587 | struct inode *inode = find_chunk(node)->watch.inode; | 604 | struct audit_chunk *chunk = find_chunk(node); |
605 | /* this could be NULL if the watch is dieing else where... */ | ||
606 | struct inode *inode = chunk->mark.i.inode; | ||
588 | node->index |= 1U<<31; | 607 | node->index |= 1U<<31; |
589 | if (iterate_mounts(compare_root, inode, root_mnt)) | 608 | if (iterate_mounts(compare_root, inode, root_mnt)) |
590 | node->index &= ~(1U<<31); | 609 | node->index &= ~(1U<<31); |
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list) | |||
846 | * Here comes the stuff asynchronous to auditctl operations | 865 | * Here comes the stuff asynchronous to auditctl operations |
847 | */ | 866 | */ |
848 | 867 | ||
849 | /* inode->inotify_mutex is locked */ | ||
850 | static void evict_chunk(struct audit_chunk *chunk) | 868 | static void evict_chunk(struct audit_chunk *chunk) |
851 | { | 869 | { |
852 | struct audit_tree *owner; | 870 | struct audit_tree *owner; |
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk) | |||
885 | mutex_unlock(&audit_filter_mutex); | 903 | mutex_unlock(&audit_filter_mutex); |
886 | } | 904 | } |
887 | 905 | ||
888 | static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, | 906 | static int audit_tree_handle_event(struct fsnotify_group *group, |
889 | u32 cookie, const char *dname, struct inode *inode) | 907 | struct fsnotify_mark *inode_mark, |
908 | struct fsnotify_mark *vfsmonut_mark, | ||
909 | struct fsnotify_event *event) | ||
910 | { | ||
911 | BUG(); | ||
912 | return -EOPNOTSUPP; | ||
913 | } | ||
914 | |||
915 | static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) | ||
890 | { | 916 | { |
891 | struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); | 917 | struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); |
892 | 918 | ||
893 | if (mask & IN_IGNORED) { | 919 | evict_chunk(chunk); |
894 | evict_chunk(chunk); | 920 | fsnotify_put_mark(entry); |
895 | put_inotify_watch(watch); | ||
896 | } | ||
897 | } | 921 | } |
898 | 922 | ||
899 | static void destroy_watch(struct inotify_watch *watch) | 923 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, |
924 | struct fsnotify_mark *inode_mark, | ||
925 | struct fsnotify_mark *vfsmount_mark, | ||
926 | __u32 mask, void *data, int data_type) | ||
900 | { | 927 | { |
901 | struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); | 928 | return false; |
902 | call_rcu(&chunk->head, __put_chunk); | ||
903 | } | 929 | } |
904 | 930 | ||
905 | static const struct inotify_operations rtree_inotify_ops = { | 931 | static const struct fsnotify_ops audit_tree_ops = { |
906 | .handle_event = handle_event, | 932 | .handle_event = audit_tree_handle_event, |
907 | .destroy_watch = destroy_watch, | 933 | .should_send_event = audit_tree_send_event, |
934 | .free_group_priv = NULL, | ||
935 | .free_event_priv = NULL, | ||
936 | .freeing_mark = audit_tree_freeing_mark, | ||
908 | }; | 937 | }; |
909 | 938 | ||
910 | static int __init audit_tree_init(void) | 939 | static int __init audit_tree_init(void) |
911 | { | 940 | { |
912 | int i; | 941 | int i; |
913 | 942 | ||
914 | rtree_ih = inotify_init(&rtree_inotify_ops); | 943 | audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); |
915 | if (IS_ERR(rtree_ih)) | 944 | if (IS_ERR(audit_tree_group)) |
916 | audit_panic("cannot initialize inotify handle for rectree watches"); | 945 | audit_panic("cannot initialize fsnotify group for rectree watches"); |
917 | 946 | ||
918 | for (i = 0; i < HASH_SIZE; i++) | 947 | for (i = 0; i < HASH_SIZE; i++) |
919 | INIT_LIST_HEAD(&chunk_hash_heads[i]); | 948 | INIT_LIST_HEAD(&chunk_hash_heads[i]); |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 8df43696f4ba..f0c9b2e7542d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c | |||
@@ -24,18 +24,18 @@ | |||
24 | #include <linux/kthread.h> | 24 | #include <linux/kthread.h> |
25 | #include <linux/mutex.h> | 25 | #include <linux/mutex.h> |
26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
27 | #include <linux/fsnotify_backend.h> | ||
27 | #include <linux/namei.h> | 28 | #include <linux/namei.h> |
28 | #include <linux/netlink.h> | 29 | #include <linux/netlink.h> |
29 | #include <linux/sched.h> | 30 | #include <linux/sched.h> |
30 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
31 | #include <linux/inotify.h> | ||
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include "audit.h" | 33 | #include "audit.h" |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * Reference counting: | 36 | * Reference counting: |
37 | * | 37 | * |
38 | * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED | 38 | * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED |
39 | * event. Each audit_watch holds a reference to its associated parent. | 39 | * event. Each audit_watch holds a reference to its associated parent. |
40 | * | 40 | * |
41 | * audit_watch: if added to lists, lifetime is from audit_init_watch() to | 41 | * audit_watch: if added to lists, lifetime is from audit_init_watch() to |
@@ -51,40 +51,61 @@ struct audit_watch { | |||
51 | unsigned long ino; /* associated inode number */ | 51 | unsigned long ino; /* associated inode number */ |
52 | struct audit_parent *parent; /* associated parent */ | 52 | struct audit_parent *parent; /* associated parent */ |
53 | struct list_head wlist; /* entry in parent->watches list */ | 53 | struct list_head wlist; /* entry in parent->watches list */ |
54 | struct list_head rules; /* associated rules */ | 54 | struct list_head rules; /* anchor for krule->rlist */ |
55 | }; | 55 | }; |
56 | 56 | ||
57 | struct audit_parent { | 57 | struct audit_parent { |
58 | struct list_head ilist; /* entry in inotify registration list */ | 58 | struct list_head watches; /* anchor for audit_watch->wlist */ |
59 | struct list_head watches; /* associated watches */ | 59 | struct fsnotify_mark mark; /* fsnotify mark on the inode */ |
60 | struct inotify_watch wdata; /* inotify watch data */ | ||
61 | unsigned flags; /* status flags */ | ||
62 | }; | 60 | }; |
63 | 61 | ||
64 | /* Inotify handle. */ | 62 | /* fsnotify handle. */ |
65 | struct inotify_handle *audit_ih; | 63 | struct fsnotify_group *audit_watch_group; |
66 | 64 | ||
67 | /* | 65 | /* fsnotify events we care about. */ |
68 | * audit_parent status flags: | 66 | #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ |
69 | * | 67 | FS_MOVE_SELF | FS_EVENT_ON_CHILD) |
70 | * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to | ||
71 | * a filesystem event to ensure we're adding audit watches to a valid parent. | ||
72 | * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot | ||
73 | * receive them while we have nameidata, but must be used for IN_MOVE_SELF which | ||
74 | * we can receive while holding nameidata. | ||
75 | */ | ||
76 | #define AUDIT_PARENT_INVALID 0x001 | ||
77 | 68 | ||
78 | /* Inotify events we care about. */ | 69 | static void audit_free_parent(struct audit_parent *parent) |
79 | #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF | 70 | { |
71 | WARN_ON(!list_empty(&parent->watches)); | ||
72 | kfree(parent); | ||
73 | } | ||
80 | 74 | ||
81 | static void audit_free_parent(struct inotify_watch *i_watch) | 75 | static void audit_watch_free_mark(struct fsnotify_mark *entry) |
82 | { | 76 | { |
83 | struct audit_parent *parent; | 77 | struct audit_parent *parent; |
84 | 78 | ||
85 | parent = container_of(i_watch, struct audit_parent, wdata); | 79 | parent = container_of(entry, struct audit_parent, mark); |
86 | WARN_ON(!list_empty(&parent->watches)); | 80 | audit_free_parent(parent); |
87 | kfree(parent); | 81 | } |
82 | |||
83 | static void audit_get_parent(struct audit_parent *parent) | ||
84 | { | ||
85 | if (likely(parent)) | ||
86 | fsnotify_get_mark(&parent->mark); | ||
87 | } | ||
88 | |||
89 | static void audit_put_parent(struct audit_parent *parent) | ||
90 | { | ||
91 | if (likely(parent)) | ||
92 | fsnotify_put_mark(&parent->mark); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * Find and return the audit_parent on the given inode. If found a reference | ||
97 | * is taken on this parent. | ||
98 | */ | ||
99 | static inline struct audit_parent *audit_find_parent(struct inode *inode) | ||
100 | { | ||
101 | struct audit_parent *parent = NULL; | ||
102 | struct fsnotify_mark *entry; | ||
103 | |||
104 | entry = fsnotify_find_inode_mark(audit_watch_group, inode); | ||
105 | if (entry) | ||
106 | parent = container_of(entry, struct audit_parent, mark); | ||
107 | |||
108 | return parent; | ||
88 | } | 109 | } |
89 | 110 | ||
90 | void audit_get_watch(struct audit_watch *watch) | 111 | void audit_get_watch(struct audit_watch *watch) |
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch) | |||
105 | void audit_remove_watch(struct audit_watch *watch) | 126 | void audit_remove_watch(struct audit_watch *watch) |
106 | { | 127 | { |
107 | list_del(&watch->wlist); | 128 | list_del(&watch->wlist); |
108 | put_inotify_watch(&watch->parent->wdata); | 129 | audit_put_parent(watch->parent); |
109 | watch->parent = NULL; | 130 | watch->parent = NULL; |
110 | audit_put_watch(watch); /* match initial get */ | 131 | audit_put_watch(watch); /* match initial get */ |
111 | } | 132 | } |
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch) | |||
115 | return watch->path; | 136 | return watch->path; |
116 | } | 137 | } |
117 | 138 | ||
118 | struct list_head *audit_watch_rules(struct audit_watch *watch) | 139 | int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) |
119 | { | ||
120 | return &watch->rules; | ||
121 | } | ||
122 | |||
123 | unsigned long audit_watch_inode(struct audit_watch *watch) | ||
124 | { | 140 | { |
125 | return watch->ino; | 141 | return (watch->ino != (unsigned long)-1) && |
126 | } | 142 | (watch->ino == ino) && |
127 | 143 | (watch->dev == dev); | |
128 | dev_t audit_watch_dev(struct audit_watch *watch) | ||
129 | { | ||
130 | return watch->dev; | ||
131 | } | 144 | } |
132 | 145 | ||
133 | /* Initialize a parent watch entry. */ | 146 | /* Initialize a parent watch entry. */ |
134 | static struct audit_parent *audit_init_parent(struct nameidata *ndp) | 147 | static struct audit_parent *audit_init_parent(struct nameidata *ndp) |
135 | { | 148 | { |
149 | struct inode *inode = ndp->path.dentry->d_inode; | ||
136 | struct audit_parent *parent; | 150 | struct audit_parent *parent; |
137 | s32 wd; | 151 | int ret; |
138 | 152 | ||
139 | parent = kzalloc(sizeof(*parent), GFP_KERNEL); | 153 | parent = kzalloc(sizeof(*parent), GFP_KERNEL); |
140 | if (unlikely(!parent)) | 154 | if (unlikely(!parent)) |
141 | return ERR_PTR(-ENOMEM); | 155 | return ERR_PTR(-ENOMEM); |
142 | 156 | ||
143 | INIT_LIST_HEAD(&parent->watches); | 157 | INIT_LIST_HEAD(&parent->watches); |
144 | parent->flags = 0; | 158 | |
145 | 159 | fsnotify_init_mark(&parent->mark, audit_watch_free_mark); | |
146 | inotify_init_watch(&parent->wdata); | 160 | parent->mark.mask = AUDIT_FS_WATCH; |
147 | /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ | 161 | ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); |
148 | get_inotify_watch(&parent->wdata); | 162 | if (ret < 0) { |
149 | wd = inotify_add_watch(audit_ih, &parent->wdata, | 163 | audit_free_parent(parent); |
150 | ndp->path.dentry->d_inode, AUDIT_IN_WATCH); | 164 | return ERR_PTR(ret); |
151 | if (wd < 0) { | ||
152 | audit_free_parent(&parent->wdata); | ||
153 | return ERR_PTR(wd); | ||
154 | } | 165 | } |
155 | 166 | ||
156 | return parent; | 167 | return parent; |
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) | |||
179 | { | 190 | { |
180 | struct audit_watch *watch; | 191 | struct audit_watch *watch; |
181 | 192 | ||
182 | if (!audit_ih) | 193 | if (!audit_watch_group) |
183 | return -EOPNOTSUPP; | 194 | return -EOPNOTSUPP; |
184 | 195 | ||
185 | if (path[0] != '/' || path[len-1] == '/' || | 196 | if (path[0] != '/' || path[len-1] == '/' || |
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) | |||
217 | 228 | ||
218 | new->dev = old->dev; | 229 | new->dev = old->dev; |
219 | new->ino = old->ino; | 230 | new->ino = old->ino; |
220 | get_inotify_watch(&old->parent->wdata); | 231 | audit_get_parent(old->parent); |
221 | new->parent = old->parent; | 232 | new->parent = old->parent; |
222 | 233 | ||
223 | out: | 234 | out: |
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent, | |||
251 | struct audit_entry *oentry, *nentry; | 262 | struct audit_entry *oentry, *nentry; |
252 | 263 | ||
253 | mutex_lock(&audit_filter_mutex); | 264 | mutex_lock(&audit_filter_mutex); |
265 | /* Run all of the watches on this parent looking for the one that | ||
266 | * matches the given dname */ | ||
254 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { | 267 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { |
255 | if (audit_compare_dname_path(dname, owatch->path, NULL)) | 268 | if (audit_compare_dname_path(dname, owatch->path, NULL)) |
256 | continue; | 269 | continue; |
257 | 270 | ||
258 | /* If the update involves invalidating rules, do the inode-based | 271 | /* If the update involves invalidating rules, do the inode-based |
259 | * filtering now, so we don't omit records. */ | 272 | * filtering now, so we don't omit records. */ |
260 | if (invalidating && current->audit_context) | 273 | if (invalidating && !audit_dummy_context()) |
261 | audit_filter_inodes(current, current->audit_context); | 274 | audit_filter_inodes(current, current->audit_context); |
262 | 275 | ||
276 | /* updating ino will likely change which audit_hash_list we | ||
277 | * are on so we need a new watch for the new list */ | ||
263 | nwatch = audit_dupe_watch(owatch); | 278 | nwatch = audit_dupe_watch(owatch); |
264 | if (IS_ERR(nwatch)) { | 279 | if (IS_ERR(nwatch)) { |
265 | mutex_unlock(&audit_filter_mutex); | 280 | mutex_unlock(&audit_filter_mutex); |
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent, | |||
275 | list_del(&oentry->rule.rlist); | 290 | list_del(&oentry->rule.rlist); |
276 | list_del_rcu(&oentry->list); | 291 | list_del_rcu(&oentry->list); |
277 | 292 | ||
278 | nentry = audit_dupe_rule(&oentry->rule, nwatch); | 293 | nentry = audit_dupe_rule(&oentry->rule); |
279 | if (IS_ERR(nentry)) { | 294 | if (IS_ERR(nentry)) { |
280 | list_del(&oentry->rule.list); | 295 | list_del(&oentry->rule.list); |
281 | audit_panic("error updating watch, removing"); | 296 | audit_panic("error updating watch, removing"); |
282 | } else { | 297 | } else { |
283 | int h = audit_hash_ino((u32)ino); | 298 | int h = audit_hash_ino((u32)ino); |
299 | |||
300 | /* | ||
301 | * nentry->rule.watch == oentry->rule.watch so | ||
302 | * we must drop that reference and set it to our | ||
303 | * new watch. | ||
304 | */ | ||
305 | audit_put_watch(nentry->rule.watch); | ||
306 | audit_get_watch(nwatch); | ||
307 | nentry->rule.watch = nwatch; | ||
284 | list_add(&nentry->rule.rlist, &nwatch->rules); | 308 | list_add(&nentry->rule.rlist, &nwatch->rules); |
285 | list_add_rcu(&nentry->list, &audit_inode_hash[h]); | 309 | list_add_rcu(&nentry->list, &audit_inode_hash[h]); |
286 | list_replace(&oentry->rule.list, | 310 | list_replace(&oentry->rule.list, |
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
312 | struct audit_entry *e; | 336 | struct audit_entry *e; |
313 | 337 | ||
314 | mutex_lock(&audit_filter_mutex); | 338 | mutex_lock(&audit_filter_mutex); |
315 | parent->flags |= AUDIT_PARENT_INVALID; | ||
316 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | 339 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { |
317 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | 340 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { |
318 | e = container_of(r, struct audit_entry, rule); | 341 | e = container_of(r, struct audit_entry, rule); |
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
325 | audit_remove_watch(w); | 348 | audit_remove_watch(w); |
326 | } | 349 | } |
327 | mutex_unlock(&audit_filter_mutex); | 350 | mutex_unlock(&audit_filter_mutex); |
328 | } | ||
329 | |||
330 | /* Unregister inotify watches for parents on in_list. | ||
331 | * Generates an IN_IGNORED event. */ | ||
332 | void audit_inotify_unregister(struct list_head *in_list) | ||
333 | { | ||
334 | struct audit_parent *p, *n; | ||
335 | 351 | ||
336 | list_for_each_entry_safe(p, n, in_list, ilist) { | 352 | fsnotify_destroy_mark(&parent->mark); |
337 | list_del(&p->ilist); | ||
338 | inotify_rm_watch(audit_ih, &p->wdata); | ||
339 | /* the unpin matching the pin in audit_do_del_rule() */ | ||
340 | unpin_inotify_watch(&p->wdata); | ||
341 | } | ||
342 | } | 353 | } |
343 | 354 | ||
344 | /* Get path information necessary for adding watches. */ | 355 | /* Get path information necessary for adding watches. */ |
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) | |||
389 | } | 400 | } |
390 | } | 401 | } |
391 | 402 | ||
392 | /* Associate the given rule with an existing parent inotify_watch. | 403 | /* Associate the given rule with an existing parent. |
393 | * Caller must hold audit_filter_mutex. */ | 404 | * Caller must hold audit_filter_mutex. */ |
394 | static void audit_add_to_parent(struct audit_krule *krule, | 405 | static void audit_add_to_parent(struct audit_krule *krule, |
395 | struct audit_parent *parent) | 406 | struct audit_parent *parent) |
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule, | |||
397 | struct audit_watch *w, *watch = krule->watch; | 408 | struct audit_watch *w, *watch = krule->watch; |
398 | int watch_found = 0; | 409 | int watch_found = 0; |
399 | 410 | ||
411 | BUG_ON(!mutex_is_locked(&audit_filter_mutex)); | ||
412 | |||
400 | list_for_each_entry(w, &parent->watches, wlist) { | 413 | list_for_each_entry(w, &parent->watches, wlist) { |
401 | if (strcmp(watch->path, w->path)) | 414 | if (strcmp(watch->path, w->path)) |
402 | continue; | 415 | continue; |
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule, | |||
413 | } | 426 | } |
414 | 427 | ||
415 | if (!watch_found) { | 428 | if (!watch_found) { |
416 | get_inotify_watch(&parent->wdata); | 429 | audit_get_parent(parent); |
417 | watch->parent = parent; | 430 | watch->parent = parent; |
418 | 431 | ||
419 | list_add(&watch->wlist, &parent->watches); | 432 | list_add(&watch->wlist, &parent->watches); |
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule, | |||
423 | 436 | ||
424 | /* Find a matching watch entry, or add this one. | 437 | /* Find a matching watch entry, or add this one. |
425 | * Caller must hold audit_filter_mutex. */ | 438 | * Caller must hold audit_filter_mutex. */ |
426 | int audit_add_watch(struct audit_krule *krule) | 439 | int audit_add_watch(struct audit_krule *krule, struct list_head **list) |
427 | { | 440 | { |
428 | struct audit_watch *watch = krule->watch; | 441 | struct audit_watch *watch = krule->watch; |
429 | struct inotify_watch *i_watch; | ||
430 | struct audit_parent *parent; | 442 | struct audit_parent *parent; |
431 | struct nameidata *ndp = NULL, *ndw = NULL; | 443 | struct nameidata *ndp = NULL, *ndw = NULL; |
432 | int ret = 0; | 444 | int h, ret = 0; |
433 | 445 | ||
434 | mutex_unlock(&audit_filter_mutex); | 446 | mutex_unlock(&audit_filter_mutex); |
435 | 447 | ||
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule) | |||
441 | goto error; | 453 | goto error; |
442 | } | 454 | } |
443 | 455 | ||
456 | mutex_lock(&audit_filter_mutex); | ||
457 | |||
444 | /* update watch filter fields */ | 458 | /* update watch filter fields */ |
445 | if (ndw) { | 459 | if (ndw) { |
446 | watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; | 460 | watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; |
447 | watch->ino = ndw->path.dentry->d_inode->i_ino; | 461 | watch->ino = ndw->path.dentry->d_inode->i_ino; |
448 | } | 462 | } |
449 | 463 | ||
450 | /* The audit_filter_mutex must not be held during inotify calls because | 464 | /* either find an old parent or attach a new one */ |
451 | * we hold it during inotify event callback processing. If an existing | 465 | parent = audit_find_parent(ndp->path.dentry->d_inode); |
452 | * inotify watch is found, inotify_find_watch() grabs a reference before | 466 | if (!parent) { |
453 | * returning. | ||
454 | */ | ||
455 | if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, | ||
456 | &i_watch) < 0) { | ||
457 | parent = audit_init_parent(ndp); | 467 | parent = audit_init_parent(ndp); |
458 | if (IS_ERR(parent)) { | 468 | if (IS_ERR(parent)) { |
459 | /* caller expects mutex locked */ | ||
460 | mutex_lock(&audit_filter_mutex); | ||
461 | ret = PTR_ERR(parent); | 469 | ret = PTR_ERR(parent); |
462 | goto error; | 470 | goto error; |
463 | } | 471 | } |
464 | } else | 472 | } |
465 | parent = container_of(i_watch, struct audit_parent, wdata); | ||
466 | |||
467 | mutex_lock(&audit_filter_mutex); | ||
468 | 473 | ||
469 | /* parent was moved before we took audit_filter_mutex */ | 474 | audit_add_to_parent(krule, parent); |
470 | if (parent->flags & AUDIT_PARENT_INVALID) | ||
471 | ret = -ENOENT; | ||
472 | else | ||
473 | audit_add_to_parent(krule, parent); | ||
474 | 475 | ||
475 | /* match get in audit_init_parent or inotify_find_watch */ | 476 | /* match get in audit_find_parent or audit_init_parent */ |
476 | put_inotify_watch(&parent->wdata); | 477 | audit_put_parent(parent); |
477 | 478 | ||
479 | h = audit_hash_ino((u32)watch->ino); | ||
480 | *list = &audit_inode_hash[h]; | ||
478 | error: | 481 | error: |
479 | audit_put_nd(ndp, ndw); /* NULL args OK */ | 482 | audit_put_nd(ndp, ndw); /* NULL args OK */ |
480 | return ret; | 483 | return ret; |
481 | 484 | ||
482 | } | 485 | } |
483 | 486 | ||
484 | void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) | 487 | void audit_remove_watch_rule(struct audit_krule *krule) |
485 | { | 488 | { |
486 | struct audit_watch *watch = krule->watch; | 489 | struct audit_watch *watch = krule->watch; |
487 | struct audit_parent *parent = watch->parent; | 490 | struct audit_parent *parent = watch->parent; |
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) | |||
492 | audit_remove_watch(watch); | 495 | audit_remove_watch(watch); |
493 | 496 | ||
494 | if (list_empty(&parent->watches)) { | 497 | if (list_empty(&parent->watches)) { |
495 | /* Put parent on the inotify un-registration | 498 | audit_get_parent(parent); |
496 | * list. Grab a reference before releasing | 499 | fsnotify_destroy_mark(&parent->mark); |
497 | * audit_filter_mutex, to be released in | 500 | audit_put_parent(parent); |
498 | * audit_inotify_unregister(). | ||
499 | * If filesystem is going away, just leave | ||
500 | * the sucker alone, eviction will take | ||
501 | * care of it. */ | ||
502 | if (pin_inotify_watch(&parent->wdata)) | ||
503 | list_add(&parent->ilist, list); | ||
504 | } | 501 | } |
505 | } | 502 | } |
506 | } | 503 | } |
507 | 504 | ||
508 | /* Update watch data in audit rules based on inotify events. */ | 505 | static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, |
509 | static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, | 506 | struct fsnotify_mark *inode_mark, |
510 | u32 cookie, const char *dname, struct inode *inode) | 507 | struct fsnotify_mark *vfsmount_mark, |
508 | __u32 mask, void *data, int data_type) | ||
509 | { | ||
510 | return true; | ||
511 | } | ||
512 | |||
513 | /* Update watch data in audit rules based on fsnotify events. */ | ||
514 | static int audit_watch_handle_event(struct fsnotify_group *group, | ||
515 | struct fsnotify_mark *inode_mark, | ||
516 | struct fsnotify_mark *vfsmount_mark, | ||
517 | struct fsnotify_event *event) | ||
511 | { | 518 | { |
519 | struct inode *inode; | ||
520 | __u32 mask = event->mask; | ||
521 | const char *dname = event->file_name; | ||
512 | struct audit_parent *parent; | 522 | struct audit_parent *parent; |
513 | 523 | ||
514 | parent = container_of(i_watch, struct audit_parent, wdata); | 524 | parent = container_of(inode_mark, struct audit_parent, mark); |
515 | 525 | ||
516 | if (mask & (IN_CREATE|IN_MOVED_TO) && inode) | 526 | BUG_ON(group != audit_watch_group); |
517 | audit_update_watch(parent, dname, inode->i_sb->s_dev, | 527 | |
518 | inode->i_ino, 0); | 528 | switch (event->data_type) { |
519 | else if (mask & (IN_DELETE|IN_MOVED_FROM)) | 529 | case (FSNOTIFY_EVENT_PATH): |
530 | inode = event->path.dentry->d_inode; | ||
531 | break; | ||
532 | case (FSNOTIFY_EVENT_INODE): | ||
533 | inode = event->inode; | ||
534 | break; | ||
535 | default: | ||
536 | BUG(); | ||
537 | inode = NULL; | ||
538 | break; | ||
539 | }; | ||
540 | |||
541 | if (mask & (FS_CREATE|FS_MOVED_TO) && inode) | ||
542 | audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); | ||
543 | else if (mask & (FS_DELETE|FS_MOVED_FROM)) | ||
520 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); | 544 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); |
521 | /* inotify automatically removes the watch and sends IN_IGNORED */ | 545 | else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) |
522 | else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) | ||
523 | audit_remove_parent_watches(parent); | ||
524 | /* inotify does not remove the watch, so remove it manually */ | ||
525 | else if(mask & IN_MOVE_SELF) { | ||
526 | audit_remove_parent_watches(parent); | 546 | audit_remove_parent_watches(parent); |
527 | inotify_remove_watch_locked(audit_ih, i_watch); | 547 | |
528 | } else if (mask & IN_IGNORED) | 548 | return 0; |
529 | put_inotify_watch(i_watch); | ||
530 | } | 549 | } |
531 | 550 | ||
532 | static const struct inotify_operations audit_inotify_ops = { | 551 | static const struct fsnotify_ops audit_watch_fsnotify_ops = { |
533 | .handle_event = audit_handle_ievent, | 552 | .should_send_event = audit_watch_should_send_event, |
534 | .destroy_watch = audit_free_parent, | 553 | .handle_event = audit_watch_handle_event, |
554 | .free_group_priv = NULL, | ||
555 | .freeing_mark = NULL, | ||
556 | .free_event_priv = NULL, | ||
535 | }; | 557 | }; |
536 | 558 | ||
537 | static int __init audit_watch_init(void) | 559 | static int __init audit_watch_init(void) |
538 | { | 560 | { |
539 | audit_ih = inotify_init(&audit_inotify_ops); | 561 | audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); |
540 | if (IS_ERR(audit_ih)) | 562 | if (IS_ERR(audit_watch_group)) { |
541 | audit_panic("cannot initialize inotify handle"); | 563 | audit_watch_group = NULL; |
564 | audit_panic("cannot create audit fsnotify group"); | ||
565 | } | ||
542 | return 0; | 566 | return 0; |
543 | } | 567 | } |
544 | subsys_initcall(audit_watch_init); | 568 | device_initcall(audit_watch_init); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ce08041f578d..eb7675499fb5 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c | |||
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
71 | { | 71 | { |
72 | int i; | 72 | int i; |
73 | struct audit_krule *erule = &e->rule; | 73 | struct audit_krule *erule = &e->rule; |
74 | |||
74 | /* some rules don't have associated watches */ | 75 | /* some rules don't have associated watches */ |
75 | if (erule->watch) | 76 | if (erule->watch) |
76 | audit_put_watch(erule->watch); | 77 | audit_put_watch(erule->watch); |
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, | |||
746 | * rule with the new rule in the filterlist, then free the old rule. | 747 | * rule with the new rule in the filterlist, then free the old rule. |
747 | * The rlist element is undefined; list manipulations are handled apart from | 748 | * The rlist element is undefined; list manipulations are handled apart from |
748 | * the initial copy. */ | 749 | * the initial copy. */ |
749 | struct audit_entry *audit_dupe_rule(struct audit_krule *old, | 750 | struct audit_entry *audit_dupe_rule(struct audit_krule *old) |
750 | struct audit_watch *watch) | ||
751 | { | 751 | { |
752 | u32 fcount = old->field_count; | 752 | u32 fcount = old->field_count; |
753 | struct audit_entry *entry; | 753 | struct audit_entry *entry; |
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
769 | new->prio = old->prio; | 769 | new->prio = old->prio; |
770 | new->buflen = old->buflen; | 770 | new->buflen = old->buflen; |
771 | new->inode_f = old->inode_f; | 771 | new->inode_f = old->inode_f; |
772 | new->watch = NULL; | ||
773 | new->field_count = old->field_count; | 772 | new->field_count = old->field_count; |
773 | |||
774 | /* | 774 | /* |
775 | * note that we are OK with not refcounting here; audit_match_tree() | 775 | * note that we are OK with not refcounting here; audit_match_tree() |
776 | * never dereferences tree and we can't get false positives there | 776 | * never dereferences tree and we can't get false positives there |
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
811 | } | 811 | } |
812 | } | 812 | } |
813 | 813 | ||
814 | if (watch) { | 814 | if (old->watch) { |
815 | audit_get_watch(watch); | 815 | audit_get_watch(old->watch); |
816 | new->watch = watch; | 816 | new->watch = old->watch; |
817 | } | 817 | } |
818 | 818 | ||
819 | return entry; | 819 | return entry; |
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
866 | struct audit_watch *watch = entry->rule.watch; | 866 | struct audit_watch *watch = entry->rule.watch; |
867 | struct audit_tree *tree = entry->rule.tree; | 867 | struct audit_tree *tree = entry->rule.tree; |
868 | struct list_head *list; | 868 | struct list_head *list; |
869 | int h, err; | 869 | int err; |
870 | #ifdef CONFIG_AUDITSYSCALL | 870 | #ifdef CONFIG_AUDITSYSCALL |
871 | int dont_count = 0; | 871 | int dont_count = 0; |
872 | 872 | ||
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry) | |||
889 | 889 | ||
890 | if (watch) { | 890 | if (watch) { |
891 | /* audit_filter_mutex is dropped and re-taken during this call */ | 891 | /* audit_filter_mutex is dropped and re-taken during this call */ |
892 | err = audit_add_watch(&entry->rule); | 892 | err = audit_add_watch(&entry->rule, &list); |
893 | if (err) { | 893 | if (err) { |
894 | mutex_unlock(&audit_filter_mutex); | 894 | mutex_unlock(&audit_filter_mutex); |
895 | goto error; | 895 | goto error; |
896 | } | 896 | } |
897 | /* entry->rule.watch may have changed during audit_add_watch() */ | ||
898 | watch = entry->rule.watch; | ||
899 | h = audit_hash_ino((u32)audit_watch_inode(watch)); | ||
900 | list = &audit_inode_hash[h]; | ||
901 | } | 897 | } |
902 | if (tree) { | 898 | if (tree) { |
903 | err = audit_add_tree_rule(&entry->rule); | 899 | err = audit_add_tree_rule(&entry->rule); |
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
949 | struct audit_watch *watch = entry->rule.watch; | 945 | struct audit_watch *watch = entry->rule.watch; |
950 | struct audit_tree *tree = entry->rule.tree; | 946 | struct audit_tree *tree = entry->rule.tree; |
951 | struct list_head *list; | 947 | struct list_head *list; |
952 | LIST_HEAD(inotify_list); | ||
953 | int ret = 0; | 948 | int ret = 0; |
954 | #ifdef CONFIG_AUDITSYSCALL | 949 | #ifdef CONFIG_AUDITSYSCALL |
955 | int dont_count = 0; | 950 | int dont_count = 0; |
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
969 | } | 964 | } |
970 | 965 | ||
971 | if (e->rule.watch) | 966 | if (e->rule.watch) |
972 | audit_remove_watch_rule(&e->rule, &inotify_list); | 967 | audit_remove_watch_rule(&e->rule); |
973 | 968 | ||
974 | if (e->rule.tree) | 969 | if (e->rule.tree) |
975 | audit_remove_tree_rule(&e->rule); | 970 | audit_remove_tree_rule(&e->rule); |
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry) | |||
987 | #endif | 982 | #endif |
988 | mutex_unlock(&audit_filter_mutex); | 983 | mutex_unlock(&audit_filter_mutex); |
989 | 984 | ||
990 | if (!list_empty(&inotify_list)) | ||
991 | audit_inotify_unregister(&inotify_list); | ||
992 | |||
993 | out: | 985 | out: |
994 | if (watch) | 986 | if (watch) |
995 | audit_put_watch(watch); /* match initial get */ | 987 | audit_put_watch(watch); /* match initial get */ |
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r) | |||
1323 | { | 1315 | { |
1324 | struct audit_entry *entry = container_of(r, struct audit_entry, rule); | 1316 | struct audit_entry *entry = container_of(r, struct audit_entry, rule); |
1325 | struct audit_entry *nentry; | 1317 | struct audit_entry *nentry; |
1326 | struct audit_watch *watch; | ||
1327 | struct audit_tree *tree; | ||
1328 | int err = 0; | 1318 | int err = 0; |
1329 | 1319 | ||
1330 | if (!security_audit_rule_known(r)) | 1320 | if (!security_audit_rule_known(r)) |
1331 | return 0; | 1321 | return 0; |
1332 | 1322 | ||
1333 | watch = r->watch; | 1323 | nentry = audit_dupe_rule(r); |
1334 | tree = r->tree; | ||
1335 | nentry = audit_dupe_rule(r, watch); | ||
1336 | if (IS_ERR(nentry)) { | 1324 | if (IS_ERR(nentry)) { |
1337 | /* save the first error encountered for the | 1325 | /* save the first error encountered for the |
1338 | * return value */ | 1326 | * return value */ |
1339 | err = PTR_ERR(nentry); | 1327 | err = PTR_ERR(nentry); |
1340 | audit_panic("error updating LSM filters"); | 1328 | audit_panic("error updating LSM filters"); |
1341 | if (watch) | 1329 | if (r->watch) |
1342 | list_del(&r->rlist); | 1330 | list_del(&r->rlist); |
1343 | list_del_rcu(&entry->list); | 1331 | list_del_rcu(&entry->list); |
1344 | list_del(&r->list); | 1332 | list_del(&r->list); |
1345 | } else { | 1333 | } else { |
1346 | if (watch) { | 1334 | if (r->watch || r->tree) |
1347 | list_add(&nentry->rule.rlist, audit_watch_rules(watch)); | ||
1348 | list_del(&r->rlist); | ||
1349 | } else if (tree) | ||
1350 | list_replace_init(&r->rlist, &nentry->rule.rlist); | 1335 | list_replace_init(&r->rlist, &nentry->rule.rlist); |
1351 | list_replace_rcu(&entry->list, &nentry->list); | 1336 | list_replace_rcu(&entry->list, &nentry->list); |
1352 | list_replace(&r->list, &nentry->rule.list); | 1337 | list_replace(&r->list, &nentry->rule.list); |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3828ad5fb8f1..1b31c130d034 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -65,7 +65,6 @@ | |||
65 | #include <linux/binfmts.h> | 65 | #include <linux/binfmts.h> |
66 | #include <linux/highmem.h> | 66 | #include <linux/highmem.h> |
67 | #include <linux/syscalls.h> | 67 | #include <linux/syscalls.h> |
68 | #include <linux/inotify.h> | ||
69 | #include <linux/capability.h> | 68 | #include <linux/capability.h> |
70 | #include <linux/fs_struct.h> | 69 | #include <linux/fs_struct.h> |
71 | 70 | ||
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
549 | } | 548 | } |
550 | break; | 549 | break; |
551 | case AUDIT_WATCH: | 550 | case AUDIT_WATCH: |
552 | if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) | 551 | if (name) |
553 | result = (name->dev == audit_watch_dev(rule->watch) && | 552 | result = audit_watch_compare(rule->watch, name->ino, name->dev); |
554 | name->ino == audit_watch_inode(rule->watch)); | ||
555 | break; | 553 | break; |
556 | case AUDIT_DIR: | 554 | case AUDIT_DIR: |
557 | if (ctx) | 555 | if (ctx) |
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode) | |||
1726 | struct audit_tree_refs *p; | 1724 | struct audit_tree_refs *p; |
1727 | struct audit_chunk *chunk; | 1725 | struct audit_chunk *chunk; |
1728 | int count; | 1726 | int count; |
1729 | if (likely(list_empty(&inode->inotify_watches))) | 1727 | if (likely(hlist_empty(&inode->i_fsnotify_marks))) |
1730 | return; | 1728 | return; |
1731 | context = current->audit_context; | 1729 | context = current->audit_context; |
1732 | p = context->trees; | 1730 | p = context->trees; |
@@ -1769,7 +1767,7 @@ retry: | |||
1769 | seq = read_seqbegin(&rename_lock); | 1767 | seq = read_seqbegin(&rename_lock); |
1770 | for(;;) { | 1768 | for(;;) { |
1771 | struct inode *inode = d->d_inode; | 1769 | struct inode *inode = d->d_inode; |
1772 | if (inode && unlikely(!list_empty(&inode->inotify_watches))) { | 1770 | if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { |
1773 | struct audit_chunk *chunk; | 1771 | struct audit_chunk *chunk; |
1774 | chunk = audit_tree_lookup(inode); | 1772 | chunk = audit_tree_lookup(inode); |
1775 | if (chunk) { | 1773 | if (chunk) { |
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name) | |||
1837 | context->names[context->name_count].ino = (unsigned long)-1; | 1835 | context->names[context->name_count].ino = (unsigned long)-1; |
1838 | context->names[context->name_count].osid = 0; | 1836 | context->names[context->name_count].osid = 0; |
1839 | ++context->name_count; | 1837 | ++context->name_count; |
1840 | if (!context->pwd.dentry) { | 1838 | if (!context->pwd.dentry) |
1841 | read_lock(&current->fs->lock); | 1839 | get_fs_pwd(current->fs, &context->pwd); |
1842 | context->pwd = current->fs->pwd; | ||
1843 | path_get(&current->fs->pwd); | ||
1844 | read_unlock(&current->fs->lock); | ||
1845 | } | ||
1846 | |||
1847 | } | 1840 | } |
1848 | 1841 | ||
1849 | /* audit_putname - intercept a putname request | 1842 | /* audit_putname - intercept a putname request |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a8ce09954404..c9483d8f6140 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1102 | if (opts->release_agent) | 1102 | if (opts->release_agent) |
1103 | return -EINVAL; | 1103 | return -EINVAL; |
1104 | opts->release_agent = | 1104 | opts->release_agent = |
1105 | kstrndup(token + 14, PATH_MAX, GFP_KERNEL); | 1105 | kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); |
1106 | if (!opts->release_agent) | 1106 | if (!opts->release_agent) |
1107 | return -ENOMEM; | 1107 | return -ENOMEM; |
1108 | } else if (!strncmp(token, "name=", 5)) { | 1108 | } else if (!strncmp(token, "name=", 5)) { |
@@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1123 | if (opts->name) | 1123 | if (opts->name) |
1124 | return -EINVAL; | 1124 | return -EINVAL; |
1125 | opts->name = kstrndup(name, | 1125 | opts->name = kstrndup(name, |
1126 | MAX_CGROUP_ROOT_NAMELEN, | 1126 | MAX_CGROUP_ROOT_NAMELEN - 1, |
1127 | GFP_KERNEL); | 1127 | GFP_KERNEL); |
1128 | if (!opts->name) | 1128 | if (!opts->name) |
1129 | return -ENOMEM; | 1129 | return -ENOMEM; |
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { | |||
1623 | .kill_sb = cgroup_kill_sb, | 1623 | .kill_sb = cgroup_kill_sb, |
1624 | }; | 1624 | }; |
1625 | 1625 | ||
1626 | static struct kobject *cgroup_kobj; | ||
1627 | |||
1626 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | 1628 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) |
1627 | { | 1629 | { |
1628 | return dentry->d_fsdata; | 1630 | return dentry->d_fsdata; |
@@ -1789,19 +1791,20 @@ out: | |||
1789 | } | 1791 | } |
1790 | 1792 | ||
1791 | /** | 1793 | /** |
1792 | * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup | 1794 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' |
1795 | * @from: attach to all cgroups of a given task | ||
1793 | * @tsk: the task to be attached | 1796 | * @tsk: the task to be attached |
1794 | */ | 1797 | */ |
1795 | int cgroup_attach_task_current_cg(struct task_struct *tsk) | 1798 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) |
1796 | { | 1799 | { |
1797 | struct cgroupfs_root *root; | 1800 | struct cgroupfs_root *root; |
1798 | struct cgroup *cur_cg; | ||
1799 | int retval = 0; | 1801 | int retval = 0; |
1800 | 1802 | ||
1801 | cgroup_lock(); | 1803 | cgroup_lock(); |
1802 | for_each_active_root(root) { | 1804 | for_each_active_root(root) { |
1803 | cur_cg = task_cgroup_from_root(current, root); | 1805 | struct cgroup *from_cg = task_cgroup_from_root(from, root); |
1804 | retval = cgroup_attach_task(cur_cg, tsk); | 1806 | |
1807 | retval = cgroup_attach_task(from_cg, tsk); | ||
1805 | if (retval) | 1808 | if (retval) |
1806 | break; | 1809 | break; |
1807 | } | 1810 | } |
@@ -1809,7 +1812,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk) | |||
1809 | 1812 | ||
1810 | return retval; | 1813 | return retval; |
1811 | } | 1814 | } |
1812 | EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); | 1815 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
1813 | 1816 | ||
1814 | /* | 1817 | /* |
1815 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex | 1818 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex |
@@ -3894,9 +3897,18 @@ int __init cgroup_init(void) | |||
3894 | hhead = css_set_hash(init_css_set.subsys); | 3897 | hhead = css_set_hash(init_css_set.subsys); |
3895 | hlist_add_head(&init_css_set.hlist, hhead); | 3898 | hlist_add_head(&init_css_set.hlist, hhead); |
3896 | BUG_ON(!init_root_id(&rootnode)); | 3899 | BUG_ON(!init_root_id(&rootnode)); |
3900 | |||
3901 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | ||
3902 | if (!cgroup_kobj) { | ||
3903 | err = -ENOMEM; | ||
3904 | goto out; | ||
3905 | } | ||
3906 | |||
3897 | err = register_filesystem(&cgroup_fs_type); | 3907 | err = register_filesystem(&cgroup_fs_type); |
3898 | if (err < 0) | 3908 | if (err < 0) { |
3909 | kobject_put(cgroup_kobj); | ||
3899 | goto out; | 3910 | goto out; |
3911 | } | ||
3900 | 3912 | ||
3901 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); | 3913 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); |
3902 | 3914 | ||
diff --git a/kernel/compat.c b/kernel/compat.c index 5adab05a3172..c9e2ec0b34a8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, | |||
279 | struct compat_rlimit __user *rlim) | 279 | struct compat_rlimit __user *rlim) |
280 | { | 280 | { |
281 | struct rlimit r; | 281 | struct rlimit r; |
282 | int ret; | ||
283 | mm_segment_t old_fs = get_fs (); | ||
284 | |||
285 | if (resource >= RLIM_NLIMITS) | ||
286 | return -EINVAL; | ||
287 | 282 | ||
288 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || | 283 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || |
289 | __get_user(r.rlim_cur, &rlim->rlim_cur) || | 284 | __get_user(r.rlim_cur, &rlim->rlim_cur) || |
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, | |||
294 | r.rlim_cur = RLIM_INFINITY; | 289 | r.rlim_cur = RLIM_INFINITY; |
295 | if (r.rlim_max == COMPAT_RLIM_INFINITY) | 290 | if (r.rlim_max == COMPAT_RLIM_INFINITY) |
296 | r.rlim_max = RLIM_INFINITY; | 291 | r.rlim_max = RLIM_INFINITY; |
297 | set_fs(KERNEL_DS); | 292 | return do_prlimit(current, resource, &r, NULL); |
298 | ret = sys_setrlimit(resource, (struct rlimit __user *) &r); | ||
299 | set_fs(old_fs); | ||
300 | return ret; | ||
301 | } | 293 | } |
302 | 294 | ||
303 | #ifdef COMPAT_RLIM_OLD_INFINITY | 295 | #ifdef COMPAT_RLIM_OLD_INFINITY |
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, | |||
329 | 321 | ||
330 | #endif | 322 | #endif |
331 | 323 | ||
332 | asmlinkage long compat_sys_getrlimit (unsigned int resource, | 324 | asmlinkage long compat_sys_getrlimit(unsigned int resource, |
333 | struct compat_rlimit __user *rlim) | 325 | struct compat_rlimit __user *rlim) |
334 | { | 326 | { |
335 | struct rlimit r; | 327 | struct rlimit r; |
336 | int ret; | 328 | int ret; |
337 | mm_segment_t old_fs = get_fs(); | ||
338 | 329 | ||
339 | set_fs(KERNEL_DS); | 330 | ret = do_prlimit(current, resource, NULL, &r); |
340 | ret = sys_getrlimit(resource, (struct rlimit __user *) &r); | ||
341 | set_fs(old_fs); | ||
342 | if (!ret) { | 331 | if (!ret) { |
343 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | 332 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) |
344 | r.rlim_cur = COMPAT_RLIM_INFINITY; | 333 | r.rlim_cur = COMPAT_RLIM_INFINITY; |
@@ -1137,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) | |||
1137 | 1126 | ||
1138 | return 0; | 1127 | return 0; |
1139 | } | 1128 | } |
1129 | |||
1130 | /* | ||
1131 | * Allocate user-space memory for the duration of a single system call, | ||
1132 | * in order to marshall parameters inside a compat thunk. | ||
1133 | */ | ||
1134 | void __user *compat_alloc_user_space(unsigned long len) | ||
1135 | { | ||
1136 | void __user *ptr; | ||
1137 | |||
1138 | /* If len would occupy more than half of the entire compat space... */ | ||
1139 | if (unlikely(len > (((compat_uptr_t)~0) >> 1))) | ||
1140 | return NULL; | ||
1141 | |||
1142 | ptr = arch_compat_alloc_user_space(len); | ||
1143 | |||
1144 | if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) | ||
1145 | return NULL; | ||
1146 | |||
1147 | return ptr; | ||
1148 | } | ||
1149 | EXPORT_SYMBOL_GPL(compat_alloc_user_space); | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4ac..f6e726f18491 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
235 | return -EINVAL; | 235 | return -EINVAL; |
236 | 236 | ||
237 | cpu_hotplug_begin(); | 237 | cpu_hotplug_begin(); |
238 | set_cpu_active(cpu, false); | ||
239 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); | 238 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); |
240 | if (err) { | 239 | if (err) { |
241 | set_cpu_active(cpu, true); | ||
242 | |||
243 | nr_calls--; | 240 | nr_calls--; |
244 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); | 241 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); |
245 | printk("%s: attempt to take down CPU %u failed\n", | 242 | printk("%s: attempt to take down CPU %u failed\n", |
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
249 | 246 | ||
250 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 247 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
251 | if (err) { | 248 | if (err) { |
252 | set_cpu_active(cpu, true); | ||
253 | /* CPU didn't die: tell everyone. Can't complain. */ | 249 | /* CPU didn't die: tell everyone. Can't complain. */ |
254 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); | 250 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); |
255 | 251 | ||
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
321 | goto out_notify; | 317 | goto out_notify; |
322 | BUG_ON(!cpu_online(cpu)); | 318 | BUG_ON(!cpu_online(cpu)); |
323 | 319 | ||
324 | set_cpu_active(cpu, true); | ||
325 | |||
326 | /* Now call notifier in preparation. */ | 320 | /* Now call notifier in preparation. */ |
327 | cpu_notify(CPU_ONLINE | mod, hcpu); | 321 | cpu_notify(CPU_ONLINE | mod, hcpu); |
328 | 322 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7cb37d86a005..b23c0979bbe7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
2113 | * but making no active use of cpusets. | 2113 | * but making no active use of cpusets. |
2114 | * | 2114 | * |
2115 | * This routine ensures that top_cpuset.cpus_allowed tracks | 2115 | * This routine ensures that top_cpuset.cpus_allowed tracks |
2116 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 2116 | * cpu_active_mask on each CPU hotplug (cpuhp) event. |
2117 | * | 2117 | * |
2118 | * Called within get_online_cpus(). Needs to call cgroup_lock() | 2118 | * Called within get_online_cpus(). Needs to call cgroup_lock() |
2119 | * before calling generate_sched_domains(). | 2119 | * before calling generate_sched_domains(). |
2120 | */ | 2120 | */ |
2121 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | 2121 | void cpuset_update_active_cpus(void) |
2122 | unsigned long phase, void *unused_cpu) | ||
2123 | { | 2122 | { |
2124 | struct sched_domain_attr *attr; | 2123 | struct sched_domain_attr *attr; |
2125 | cpumask_var_t *doms; | 2124 | cpumask_var_t *doms; |
2126 | int ndoms; | 2125 | int ndoms; |
2127 | 2126 | ||
2128 | switch (phase) { | ||
2129 | case CPU_ONLINE: | ||
2130 | case CPU_ONLINE_FROZEN: | ||
2131 | case CPU_DOWN_PREPARE: | ||
2132 | case CPU_DOWN_PREPARE_FROZEN: | ||
2133 | case CPU_DOWN_FAILED: | ||
2134 | case CPU_DOWN_FAILED_FROZEN: | ||
2135 | break; | ||
2136 | |||
2137 | default: | ||
2138 | return NOTIFY_DONE; | ||
2139 | } | ||
2140 | |||
2141 | cgroup_lock(); | 2127 | cgroup_lock(); |
2142 | mutex_lock(&callback_mutex); | 2128 | mutex_lock(&callback_mutex); |
2143 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2129 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
2148 | 2134 | ||
2149 | /* Have scheduler rebuild the domains */ | 2135 | /* Have scheduler rebuild the domains */ |
2150 | partition_sched_domains(ndoms, doms, attr); | 2136 | partition_sched_domains(ndoms, doms, attr); |
2151 | |||
2152 | return NOTIFY_OK; | ||
2153 | } | 2137 | } |
2154 | 2138 | ||
2155 | #ifdef CONFIG_MEMORY_HOTPLUG | 2139 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) | |||
2203 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2187 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
2204 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2188 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
2205 | 2189 | ||
2206 | hotcpu_notifier(cpuset_track_online_cpus, 0); | ||
2207 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2190 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2208 | 2191 | ||
2209 | cpuset_wq = create_singlethread_workqueue("cpuset"); | 2192 | cpuset_wq = create_singlethread_workqueue("cpuset"); |
diff --git a/kernel/cred.c b/kernel/cred.c index 60bc8b1e32e6..9a3e22641fe7 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -22,10 +22,6 @@ | |||
22 | #define kdebug(FMT, ...) \ | 22 | #define kdebug(FMT, ...) \ |
23 | printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) | 23 | printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) |
24 | #else | 24 | #else |
25 | static inline __attribute__((format(printf, 1, 2))) | ||
26 | void no_printk(const char *fmt, ...) | ||
27 | { | ||
28 | } | ||
29 | #define kdebug(FMT, ...) \ | 25 | #define kdebug(FMT, ...) \ |
30 | no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) | 26 | no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) |
31 | #endif | 27 | #endif |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 51d14fe87648..de407c78178d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -605,6 +605,8 @@ cpu_master_loop: | |||
605 | if (dbg_kdb_mode) { | 605 | if (dbg_kdb_mode) { |
606 | kgdb_connected = 1; | 606 | kgdb_connected = 1; |
607 | error = kdb_stub(ks); | 607 | error = kdb_stub(ks); |
608 | if (error == -1) | ||
609 | continue; | ||
608 | kgdb_connected = 0; | 610 | kgdb_connected = 0; |
609 | } else { | 611 | } else { |
610 | error = gdb_serial_stub(ks); | 612 | error = gdb_serial_stub(ks); |
@@ -739,7 +741,7 @@ static struct console kgdbcons = { | |||
739 | }; | 741 | }; |
740 | 742 | ||
741 | #ifdef CONFIG_MAGIC_SYSRQ | 743 | #ifdef CONFIG_MAGIC_SYSRQ |
742 | static void sysrq_handle_dbg(int key, struct tty_struct *tty) | 744 | static void sysrq_handle_dbg(int key) |
743 | { | 745 | { |
744 | if (!dbg_io_ops) { | 746 | if (!dbg_io_ops) { |
745 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); | 747 | printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 6e81fd59566b..481a7bd2dfe7 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES + | |||
52 | * GDB remote protocol parser: | 52 | * GDB remote protocol parser: |
53 | */ | 53 | */ |
54 | 54 | ||
55 | static int hex(char ch) | ||
56 | { | ||
57 | if ((ch >= 'a') && (ch <= 'f')) | ||
58 | return ch - 'a' + 10; | ||
59 | if ((ch >= '0') && (ch <= '9')) | ||
60 | return ch - '0'; | ||
61 | if ((ch >= 'A') && (ch <= 'F')) | ||
62 | return ch - 'A' + 10; | ||
63 | return -1; | ||
64 | } | ||
65 | |||
66 | #ifdef CONFIG_KGDB_KDB | 55 | #ifdef CONFIG_KGDB_KDB |
67 | static int gdbstub_read_wait(void) | 56 | static int gdbstub_read_wait(void) |
68 | { | 57 | { |
@@ -123,8 +112,8 @@ static void get_packet(char *buffer) | |||
123 | buffer[count] = 0; | 112 | buffer[count] = 0; |
124 | 113 | ||
125 | if (ch == '#') { | 114 | if (ch == '#') { |
126 | xmitcsum = hex(gdbstub_read_wait()) << 4; | 115 | xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; |
127 | xmitcsum += hex(gdbstub_read_wait()); | 116 | xmitcsum += hex_to_bin(gdbstub_read_wait()); |
128 | 117 | ||
129 | if (checksum != xmitcsum) | 118 | if (checksum != xmitcsum) |
130 | /* failed checksum */ | 119 | /* failed checksum */ |
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) | |||
236 | * buf. Return a pointer to the last char put in buf (null). May | 225 | * buf. Return a pointer to the last char put in buf (null). May |
237 | * return an error. | 226 | * return an error. |
238 | */ | 227 | */ |
239 | int kgdb_mem2hex(char *mem, char *buf, int count) | 228 | char *kgdb_mem2hex(char *mem, char *buf, int count) |
240 | { | 229 | { |
241 | char *tmp; | 230 | char *tmp; |
242 | int err; | 231 | int err; |
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) | |||
248 | tmp = buf + count; | 237 | tmp = buf + count; |
249 | 238 | ||
250 | err = probe_kernel_read(tmp, mem, count); | 239 | err = probe_kernel_read(tmp, mem, count); |
251 | if (!err) { | 240 | if (err) |
252 | while (count > 0) { | 241 | return NULL; |
253 | buf = pack_hex_byte(buf, *tmp); | 242 | while (count > 0) { |
254 | tmp++; | 243 | buf = pack_hex_byte(buf, *tmp); |
255 | count--; | 244 | tmp++; |
256 | } | 245 | count--; |
257 | |||
258 | *buf = 0; | ||
259 | } | 246 | } |
247 | *buf = 0; | ||
260 | 248 | ||
261 | return err; | 249 | return buf; |
262 | } | 250 | } |
263 | 251 | ||
264 | /* | 252 | /* |
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count) | |||
280 | tmp_hex = tmp_raw - 1; | 268 | tmp_hex = tmp_raw - 1; |
281 | while (tmp_hex >= buf) { | 269 | while (tmp_hex >= buf) { |
282 | tmp_raw--; | 270 | tmp_raw--; |
283 | *tmp_raw = hex(*tmp_hex--); | 271 | *tmp_raw = hex_to_bin(*tmp_hex--); |
284 | *tmp_raw |= hex(*tmp_hex--) << 4; | 272 | *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; |
285 | } | 273 | } |
286 | 274 | ||
287 | return probe_kernel_write(mem, tmp_raw, count); | 275 | return probe_kernel_write(mem, tmp_raw, count); |
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) | |||
304 | (*ptr)++; | 292 | (*ptr)++; |
305 | } | 293 | } |
306 | while (**ptr) { | 294 | while (**ptr) { |
307 | hex_val = hex(**ptr); | 295 | hex_val = hex_to_bin(**ptr); |
308 | if (hex_val < 0) | 296 | if (hex_val < 0) |
309 | break; | 297 | break; |
310 | 298 | ||
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) | |||
339 | return probe_kernel_write(mem, c, size); | 327 | return probe_kernel_write(mem, c, size); |
340 | } | 328 | } |
341 | 329 | ||
330 | #if DBG_MAX_REG_NUM > 0 | ||
331 | void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
332 | { | ||
333 | int i; | ||
334 | int idx = 0; | ||
335 | char *ptr = (char *)gdb_regs; | ||
336 | |||
337 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
338 | dbg_get_reg(i, ptr + idx, regs); | ||
339 | idx += dbg_reg_def[i].size; | ||
340 | } | ||
341 | } | ||
342 | |||
343 | void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) | ||
344 | { | ||
345 | int i; | ||
346 | int idx = 0; | ||
347 | char *ptr = (char *)gdb_regs; | ||
348 | |||
349 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
350 | dbg_set_reg(i, ptr + idx, regs); | ||
351 | idx += dbg_reg_def[i].size; | ||
352 | } | ||
353 | } | ||
354 | #endif /* DBG_MAX_REG_NUM > 0 */ | ||
355 | |||
342 | /* Write memory due to an 'M' or 'X' packet. */ | 356 | /* Write memory due to an 'M' or 'X' packet. */ |
343 | static int write_mem_msg(int binary) | 357 | static int write_mem_msg(int binary) |
344 | { | 358 | { |
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error) | |||
378 | * remapped to negative TIDs. | 392 | * remapped to negative TIDs. |
379 | */ | 393 | */ |
380 | 394 | ||
381 | #define BUF_THREAD_ID_SIZE 16 | 395 | #define BUF_THREAD_ID_SIZE 8 |
382 | 396 | ||
383 | static char *pack_threadid(char *pkt, unsigned char *id) | 397 | static char *pack_threadid(char *pkt, unsigned char *id) |
384 | { | 398 | { |
385 | char *limit; | 399 | unsigned char *limit; |
400 | int lzero = 1; | ||
401 | |||
402 | limit = id + (BUF_THREAD_ID_SIZE / 2); | ||
403 | while (id < limit) { | ||
404 | if (!lzero || *id != 0) { | ||
405 | pkt = pack_hex_byte(pkt, *id); | ||
406 | lzero = 0; | ||
407 | } | ||
408 | id++; | ||
409 | } | ||
386 | 410 | ||
387 | limit = pkt + BUF_THREAD_ID_SIZE; | 411 | if (lzero) |
388 | while (pkt < limit) | 412 | pkt = pack_hex_byte(pkt, 0); |
389 | pkt = pack_hex_byte(pkt, *id++); | ||
390 | 413 | ||
391 | return pkt; | 414 | return pkt; |
392 | } | 415 | } |
393 | 416 | ||
394 | static void int_to_threadref(unsigned char *id, int value) | 417 | static void int_to_threadref(unsigned char *id, int value) |
395 | { | 418 | { |
396 | unsigned char *scan; | 419 | put_unaligned_be32(value, id); |
397 | int i = 4; | ||
398 | |||
399 | scan = (unsigned char *)id; | ||
400 | while (i--) | ||
401 | *scan++ = 0; | ||
402 | put_unaligned_be32(value, scan); | ||
403 | } | 420 | } |
404 | 421 | ||
405 | static struct task_struct *getthread(struct pt_regs *regs, int tid) | 422 | static struct task_struct *getthread(struct pt_regs *regs, int tid) |
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) | |||
463 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); | 480 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); |
464 | } | 481 | } |
465 | 482 | ||
466 | /* Handle the 'g' get registers request */ | 483 | static void gdb_get_regs_helper(struct kgdb_state *ks) |
467 | static void gdb_cmd_getregs(struct kgdb_state *ks) | ||
468 | { | 484 | { |
469 | struct task_struct *thread; | 485 | struct task_struct *thread; |
470 | void *local_debuggerinfo; | 486 | void *local_debuggerinfo; |
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) | |||
505 | */ | 521 | */ |
506 | sleeping_thread_to_gdb_regs(gdb_regs, thread); | 522 | sleeping_thread_to_gdb_regs(gdb_regs, thread); |
507 | } | 523 | } |
524 | } | ||
525 | |||
526 | /* Handle the 'g' get registers request */ | ||
527 | static void gdb_cmd_getregs(struct kgdb_state *ks) | ||
528 | { | ||
529 | gdb_get_regs_helper(ks); | ||
508 | kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); | 530 | kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); |
509 | } | 531 | } |
510 | 532 | ||
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) | |||
527 | char *ptr = &remcom_in_buffer[1]; | 549 | char *ptr = &remcom_in_buffer[1]; |
528 | unsigned long length; | 550 | unsigned long length; |
529 | unsigned long addr; | 551 | unsigned long addr; |
530 | int err; | 552 | char *err; |
531 | 553 | ||
532 | if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && | 554 | if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && |
533 | kgdb_hex2long(&ptr, &length) > 0) { | 555 | kgdb_hex2long(&ptr, &length) > 0) { |
534 | err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); | 556 | err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); |
535 | if (err) | 557 | if (!err) |
536 | error_packet(remcom_out_buffer, err); | 558 | error_packet(remcom_out_buffer, -EINVAL); |
537 | } else { | 559 | } else { |
538 | error_packet(remcom_out_buffer, -EINVAL); | 560 | error_packet(remcom_out_buffer, -EINVAL); |
539 | } | 561 | } |
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) | |||
550 | strcpy(remcom_out_buffer, "OK"); | 572 | strcpy(remcom_out_buffer, "OK"); |
551 | } | 573 | } |
552 | 574 | ||
575 | #if DBG_MAX_REG_NUM > 0 | ||
576 | static char *gdb_hex_reg_helper(int regnum, char *out) | ||
577 | { | ||
578 | int i; | ||
579 | int offset = 0; | ||
580 | |||
581 | for (i = 0; i < regnum; i++) | ||
582 | offset += dbg_reg_def[i].size; | ||
583 | return kgdb_mem2hex((char *)gdb_regs + offset, out, | ||
584 | dbg_reg_def[i].size); | ||
585 | } | ||
586 | |||
587 | /* Handle the 'p' individual regster get */ | ||
588 | static void gdb_cmd_reg_get(struct kgdb_state *ks) | ||
589 | { | ||
590 | unsigned long regnum; | ||
591 | char *ptr = &remcom_in_buffer[1]; | ||
592 | |||
593 | kgdb_hex2long(&ptr, &regnum); | ||
594 | if (regnum >= DBG_MAX_REG_NUM) { | ||
595 | error_packet(remcom_out_buffer, -EINVAL); | ||
596 | return; | ||
597 | } | ||
598 | gdb_get_regs_helper(ks); | ||
599 | gdb_hex_reg_helper(regnum, remcom_out_buffer); | ||
600 | } | ||
601 | |||
602 | /* Handle the 'P' individual regster set */ | ||
603 | static void gdb_cmd_reg_set(struct kgdb_state *ks) | ||
604 | { | ||
605 | unsigned long regnum; | ||
606 | char *ptr = &remcom_in_buffer[1]; | ||
607 | int i = 0; | ||
608 | |||
609 | kgdb_hex2long(&ptr, &regnum); | ||
610 | if (*ptr++ != '=' || | ||
611 | !(!kgdb_usethread || kgdb_usethread == current) || | ||
612 | !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { | ||
613 | error_packet(remcom_out_buffer, -EINVAL); | ||
614 | return; | ||
615 | } | ||
616 | memset(gdb_regs, 0, sizeof(gdb_regs)); | ||
617 | while (i < sizeof(gdb_regs) * 2) | ||
618 | if (hex_to_bin(ptr[i]) >= 0) | ||
619 | i++; | ||
620 | else | ||
621 | break; | ||
622 | i = i / 2; | ||
623 | kgdb_hex2mem(ptr, (char *)gdb_regs, i); | ||
624 | dbg_set_reg(regnum, gdb_regs, ks->linux_regs); | ||
625 | strcpy(remcom_out_buffer, "OK"); | ||
626 | } | ||
627 | #endif /* DBG_MAX_REG_NUM > 0 */ | ||
628 | |||
553 | /* Handle the 'X' memory binary write bytes */ | 629 | /* Handle the 'X' memory binary write bytes */ |
554 | static void gdb_cmd_binwrite(struct kgdb_state *ks) | 630 | static void gdb_cmd_binwrite(struct kgdb_state *ks) |
555 | { | 631 | { |
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
612 | { | 688 | { |
613 | struct task_struct *g; | 689 | struct task_struct *g; |
614 | struct task_struct *p; | 690 | struct task_struct *p; |
615 | unsigned char thref[8]; | 691 | unsigned char thref[BUF_THREAD_ID_SIZE]; |
616 | char *ptr; | 692 | char *ptr; |
617 | int i; | 693 | int i; |
618 | int cpu; | 694 | int cpu; |
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
632 | for_each_online_cpu(cpu) { | 708 | for_each_online_cpu(cpu) { |
633 | ks->thr_query = 0; | 709 | ks->thr_query = 0; |
634 | int_to_threadref(thref, -cpu - 2); | 710 | int_to_threadref(thref, -cpu - 2); |
635 | pack_threadid(ptr, thref); | 711 | ptr = pack_threadid(ptr, thref); |
636 | ptr += BUF_THREAD_ID_SIZE; | ||
637 | *(ptr++) = ','; | 712 | *(ptr++) = ','; |
638 | i++; | 713 | i++; |
639 | } | 714 | } |
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) | |||
642 | do_each_thread(g, p) { | 717 | do_each_thread(g, p) { |
643 | if (i >= ks->thr_query && !finished) { | 718 | if (i >= ks->thr_query && !finished) { |
644 | int_to_threadref(thref, p->pid); | 719 | int_to_threadref(thref, p->pid); |
645 | pack_threadid(ptr, thref); | 720 | ptr = pack_threadid(ptr, thref); |
646 | ptr += BUF_THREAD_ID_SIZE; | ||
647 | *(ptr++) = ','; | 721 | *(ptr++) = ','; |
648 | ks->thr_query++; | 722 | ks->thr_query++; |
649 | if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) | 723 | if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) |
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
858 | int error = 0; | 932 | int error = 0; |
859 | int tmp; | 933 | int tmp; |
860 | 934 | ||
861 | /* Clear the out buffer. */ | 935 | /* Initialize comm buffer and globals. */ |
862 | memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); | 936 | memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); |
937 | kgdb_usethread = kgdb_info[ks->cpu].task; | ||
938 | ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); | ||
939 | ks->pass_exception = 0; | ||
863 | 940 | ||
864 | if (kgdb_connected) { | 941 | if (kgdb_connected) { |
865 | unsigned char thref[8]; | 942 | unsigned char thref[BUF_THREAD_ID_SIZE]; |
866 | char *ptr; | 943 | char *ptr; |
867 | 944 | ||
868 | /* Reply to host that an exception has occurred */ | 945 | /* Reply to host that an exception has occurred */ |
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
876 | put_packet(remcom_out_buffer); | 953 | put_packet(remcom_out_buffer); |
877 | } | 954 | } |
878 | 955 | ||
879 | kgdb_usethread = kgdb_info[ks->cpu].task; | ||
880 | ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); | ||
881 | ks->pass_exception = 0; | ||
882 | |||
883 | while (1) { | 956 | while (1) { |
884 | error = 0; | 957 | error = 0; |
885 | 958 | ||
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
904 | case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ | 977 | case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ |
905 | gdb_cmd_memwrite(ks); | 978 | gdb_cmd_memwrite(ks); |
906 | break; | 979 | break; |
980 | #if DBG_MAX_REG_NUM > 0 | ||
981 | case 'p': /* pXX Return gdb register XX (in hex) */ | ||
982 | gdb_cmd_reg_get(ks); | ||
983 | break; | ||
984 | case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ | ||
985 | gdb_cmd_reg_set(ks); | ||
986 | break; | ||
987 | #endif /* DBG_MAX_REG_NUM > 0 */ | ||
907 | case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ | 988 | case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ |
908 | gdb_cmd_binwrite(ks); | 989 | gdb_cmd_binwrite(ks); |
909 | break; | 990 | break; |
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3ebbb7..20059ef4459a 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c | |||
@@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) | |||
274 | int i, bpno; | 274 | int i, bpno; |
275 | kdb_bp_t *bp, *bp_check; | 275 | kdb_bp_t *bp, *bp_check; |
276 | int diag; | 276 | int diag; |
277 | int free; | ||
278 | char *symname = NULL; | 277 | char *symname = NULL; |
279 | long offset = 0ul; | 278 | long offset = 0ul; |
280 | int nextarg; | 279 | int nextarg; |
@@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) | |||
305 | /* | 304 | /* |
306 | * Find an empty bp structure to allocate | 305 | * Find an empty bp structure to allocate |
307 | */ | 306 | */ |
308 | free = KDB_MAXBPT; | ||
309 | for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { | 307 | for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { |
310 | if (bp->bp_free) | 308 | if (bp->bp_free) |
311 | break; | 309 | break; |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index ebe4a287419e..caf057a3de0e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) | |||
312 | 312 | ||
313 | if (endp == arg) { | 313 | if (endp == arg) { |
314 | /* | 314 | /* |
315 | * Try base 16, for us folks too lazy to type the | 315 | * Also try base 16, for us folks too lazy to type the |
316 | * leading 0x... | 316 | * leading 0x... |
317 | */ | 317 | */ |
318 | val = simple_strtoul(arg, &endp, 16); | 318 | val = simple_strtoul(arg, &endp, 16); |
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) | |||
325 | return 0; | 325 | return 0; |
326 | } | 326 | } |
327 | 327 | ||
328 | int kdbgetu64arg(const char *arg, u64 *value) | ||
329 | { | ||
330 | char *endp; | ||
331 | u64 val; | ||
332 | |||
333 | val = simple_strtoull(arg, &endp, 0); | ||
334 | |||
335 | if (endp == arg) { | ||
336 | |||
337 | val = simple_strtoull(arg, &endp, 16); | ||
338 | if (endp == arg) | ||
339 | return KDB_BADINT; | ||
340 | } | ||
341 | |||
342 | *value = val; | ||
343 | |||
344 | return 0; | ||
345 | } | ||
346 | |||
328 | /* | 347 | /* |
329 | * kdb_set - This function implements the 'set' command. Alter an | 348 | * kdb_set - This function implements the 'set' command. Alter an |
330 | * existing environment variable or create a new one. | 349 | * existing environment variable or create a new one. |
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) | |||
1770 | */ | 1789 | */ |
1771 | static int kdb_rd(int argc, const char **argv) | 1790 | static int kdb_rd(int argc, const char **argv) |
1772 | { | 1791 | { |
1773 | int diag = kdb_check_regs(); | 1792 | int len = kdb_check_regs(); |
1774 | if (diag) | 1793 | #if DBG_MAX_REG_NUM > 0 |
1775 | return diag; | 1794 | int i; |
1795 | char *rname; | ||
1796 | int rsize; | ||
1797 | u64 reg64; | ||
1798 | u32 reg32; | ||
1799 | u16 reg16; | ||
1800 | u8 reg8; | ||
1801 | |||
1802 | if (len) | ||
1803 | return len; | ||
1804 | |||
1805 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
1806 | rsize = dbg_reg_def[i].size * 2; | ||
1807 | if (rsize > 16) | ||
1808 | rsize = 2; | ||
1809 | if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { | ||
1810 | len = 0; | ||
1811 | kdb_printf("\n"); | ||
1812 | } | ||
1813 | if (len) | ||
1814 | len += kdb_printf(" "); | ||
1815 | switch(dbg_reg_def[i].size * 8) { | ||
1816 | case 8: | ||
1817 | rname = dbg_get_reg(i, &reg8, kdb_current_regs); | ||
1818 | if (!rname) | ||
1819 | break; | ||
1820 | len += kdb_printf("%s: %02x", rname, reg8); | ||
1821 | break; | ||
1822 | case 16: | ||
1823 | rname = dbg_get_reg(i, &reg16, kdb_current_regs); | ||
1824 | if (!rname) | ||
1825 | break; | ||
1826 | len += kdb_printf("%s: %04x", rname, reg16); | ||
1827 | break; | ||
1828 | case 32: | ||
1829 | rname = dbg_get_reg(i, &reg32, kdb_current_regs); | ||
1830 | if (!rname) | ||
1831 | break; | ||
1832 | len += kdb_printf("%s: %08x", rname, reg32); | ||
1833 | break; | ||
1834 | case 64: | ||
1835 | rname = dbg_get_reg(i, &reg64, kdb_current_regs); | ||
1836 | if (!rname) | ||
1837 | break; | ||
1838 | len += kdb_printf("%s: %016llx", rname, reg64); | ||
1839 | break; | ||
1840 | default: | ||
1841 | len += kdb_printf("%s: ??", dbg_reg_def[i].name); | ||
1842 | } | ||
1843 | } | ||
1844 | kdb_printf("\n"); | ||
1845 | #else | ||
1846 | if (len) | ||
1847 | return len; | ||
1776 | 1848 | ||
1777 | kdb_dumpregs(kdb_current_regs); | 1849 | kdb_dumpregs(kdb_current_regs); |
1850 | #endif | ||
1778 | return 0; | 1851 | return 0; |
1779 | } | 1852 | } |
1780 | 1853 | ||
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) | |||
1782 | * kdb_rm - This function implements the 'rm' (register modify) command. | 1855 | * kdb_rm - This function implements the 'rm' (register modify) command. |
1783 | * rm register-name new-contents | 1856 | * rm register-name new-contents |
1784 | * Remarks: | 1857 | * Remarks: |
1785 | * Currently doesn't allow modification of control or | 1858 | * Allows register modification with the same restrictions as gdb |
1786 | * debug registers. | ||
1787 | */ | 1859 | */ |
1788 | static int kdb_rm(int argc, const char **argv) | 1860 | static int kdb_rm(int argc, const char **argv) |
1789 | { | 1861 | { |
1862 | #if DBG_MAX_REG_NUM > 0 | ||
1790 | int diag; | 1863 | int diag; |
1791 | int ind = 0; | 1864 | const char *rname; |
1792 | unsigned long contents; | 1865 | int i; |
1866 | u64 reg64; | ||
1867 | u32 reg32; | ||
1868 | u16 reg16; | ||
1869 | u8 reg8; | ||
1793 | 1870 | ||
1794 | if (argc != 2) | 1871 | if (argc != 2) |
1795 | return KDB_ARGCOUNT; | 1872 | return KDB_ARGCOUNT; |
1796 | /* | 1873 | /* |
1797 | * Allow presence or absence of leading '%' symbol. | 1874 | * Allow presence or absence of leading '%' symbol. |
1798 | */ | 1875 | */ |
1799 | if (argv[1][0] == '%') | 1876 | rname = argv[1]; |
1800 | ind = 1; | 1877 | if (*rname == '%') |
1878 | rname++; | ||
1801 | 1879 | ||
1802 | diag = kdbgetularg(argv[2], &contents); | 1880 | diag = kdbgetu64arg(argv[2], &reg64); |
1803 | if (diag) | 1881 | if (diag) |
1804 | return diag; | 1882 | return diag; |
1805 | 1883 | ||
1806 | diag = kdb_check_regs(); | 1884 | diag = kdb_check_regs(); |
1807 | if (diag) | 1885 | if (diag) |
1808 | return diag; | 1886 | return diag; |
1887 | |||
1888 | diag = KDB_BADREG; | ||
1889 | for (i = 0; i < DBG_MAX_REG_NUM; i++) { | ||
1890 | if (strcmp(rname, dbg_reg_def[i].name) == 0) { | ||
1891 | diag = 0; | ||
1892 | break; | ||
1893 | } | ||
1894 | } | ||
1895 | if (!diag) { | ||
1896 | switch(dbg_reg_def[i].size * 8) { | ||
1897 | case 8: | ||
1898 | reg8 = reg64; | ||
1899 | dbg_set_reg(i, &reg8, kdb_current_regs); | ||
1900 | break; | ||
1901 | case 16: | ||
1902 | reg16 = reg64; | ||
1903 | dbg_set_reg(i, &reg16, kdb_current_regs); | ||
1904 | break; | ||
1905 | case 32: | ||
1906 | reg32 = reg64; | ||
1907 | dbg_set_reg(i, &reg32, kdb_current_regs); | ||
1908 | break; | ||
1909 | case 64: | ||
1910 | dbg_set_reg(i, &reg64, kdb_current_regs); | ||
1911 | break; | ||
1912 | } | ||
1913 | } | ||
1914 | return diag; | ||
1915 | #else | ||
1809 | kdb_printf("ERROR: Register set currently not implemented\n"); | 1916 | kdb_printf("ERROR: Register set currently not implemented\n"); |
1810 | return 0; | 1917 | return 0; |
1918 | #endif | ||
1811 | } | 1919 | } |
1812 | 1920 | ||
1813 | #if defined(CONFIG_MAGIC_SYSRQ) | 1921 | #if defined(CONFIG_MAGIC_SYSRQ) |
@@ -1821,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv) | |||
1821 | if (argc != 1) | 1929 | if (argc != 1) |
1822 | return KDB_ARGCOUNT; | 1930 | return KDB_ARGCOUNT; |
1823 | kdb_trap_printk++; | 1931 | kdb_trap_printk++; |
1824 | __handle_sysrq(*argv[1], NULL, 0); | 1932 | __handle_sysrq(*argv[1], false); |
1825 | kdb_trap_printk--; | 1933 | kdb_trap_printk--; |
1826 | 1934 | ||
1827 | return 0; | 1935 | return 0; |
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) | |||
2440 | */ | 2548 | */ |
2441 | static int kdb_summary(int argc, const char **argv) | 2549 | static int kdb_summary(int argc, const char **argv) |
2442 | { | 2550 | { |
2551 | struct timespec now; | ||
2443 | struct kdb_tm tm; | 2552 | struct kdb_tm tm; |
2444 | struct sysinfo val; | 2553 | struct sysinfo val; |
2445 | 2554 | ||
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) | |||
2454 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); | 2563 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); |
2455 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); | 2564 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); |
2456 | 2565 | ||
2457 | kdb_gmtime(&xtime, &tm); | 2566 | now = __current_kernel_time(); |
2567 | kdb_gmtime(&now, &tm); | ||
2458 | kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " | 2568 | kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " |
2459 | "tz_minuteswest %d\n", | 2569 | "tz_minuteswest %d\n", |
2460 | 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, | 2570 | 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba69775d..be775f7e81e0 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); | |||
144 | extern int kdb_putword(unsigned long, unsigned long, size_t); | 144 | extern int kdb_putword(unsigned long, unsigned long, size_t); |
145 | 145 | ||
146 | extern int kdbgetularg(const char *, unsigned long *); | 146 | extern int kdbgetularg(const char *, unsigned long *); |
147 | extern int kdb_set(int, const char **); | ||
148 | extern char *kdbgetenv(const char *); | 147 | extern char *kdbgetenv(const char *); |
149 | extern int kdbgetintenv(const char *, int *); | ||
150 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, | 148 | extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, |
151 | long *, char **); | 149 | long *, char **); |
152 | extern int kdbgetsymval(const char *, kdb_symtab_t *); | 150 | extern int kdbgetsymval(const char *, kdb_symtab_t *); |
@@ -257,7 +255,14 @@ extern void kdb_ps1(const struct task_struct *p); | |||
257 | extern void kdb_print_nameval(const char *name, unsigned long val); | 255 | extern void kdb_print_nameval(const char *name, unsigned long val); |
258 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 256 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
259 | extern void kdb_meminfo_proc_show(void); | 257 | extern void kdb_meminfo_proc_show(void); |
258 | #ifdef CONFIG_KALLSYMS | ||
260 | extern const char *kdb_walk_kallsyms(loff_t *pos); | 259 | extern const char *kdb_walk_kallsyms(loff_t *pos); |
260 | #else /* ! CONFIG_KALLSYMS */ | ||
261 | static inline const char *kdb_walk_kallsyms(loff_t *pos) | ||
262 | { | ||
263 | return NULL; | ||
264 | } | ||
265 | #endif /* ! CONFIG_KALLSYMS */ | ||
261 | extern char *kdb_getstr(char *, size_t, char *); | 266 | extern char *kdb_getstr(char *, size_t, char *); |
262 | 267 | ||
263 | /* Defines for kdb_symbol_print */ | 268 | /* Defines for kdb_symbol_print */ |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 45344d5c53dd..6b2485dcb050 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
@@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */ | |||
82 | int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) | 82 | int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) |
83 | { | 83 | { |
84 | int ret = 0; | 84 | int ret = 0; |
85 | unsigned long symbolsize; | 85 | unsigned long symbolsize = 0; |
86 | unsigned long offset; | 86 | unsigned long offset = 0; |
87 | #define knt1_size 128 /* must be >= kallsyms table size */ | 87 | #define knt1_size 128 /* must be >= kallsyms table size */ |
88 | char *knt1 = NULL; | 88 | char *knt1 = NULL; |
89 | 89 | ||
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index dd62f8e714ca..0dbeae374225 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -134,23 +134,14 @@ unregister: | |||
134 | return 0; | 134 | return 0; |
135 | } | 135 | } |
136 | 136 | ||
137 | int | 137 | int __set_personality(unsigned int personality) |
138 | __set_personality(unsigned int personality) | ||
139 | { | 138 | { |
140 | struct exec_domain *ep, *oep; | 139 | struct exec_domain *oep = current_thread_info()->exec_domain; |
141 | |||
142 | ep = lookup_exec_domain(personality); | ||
143 | if (ep == current_thread_info()->exec_domain) { | ||
144 | current->personality = personality; | ||
145 | module_put(ep->module); | ||
146 | return 0; | ||
147 | } | ||
148 | 140 | ||
141 | current_thread_info()->exec_domain = lookup_exec_domain(personality); | ||
149 | current->personality = personality; | 142 | current->personality = personality; |
150 | oep = current_thread_info()->exec_domain; | ||
151 | current_thread_info()->exec_domain = ep; | ||
152 | |||
153 | module_put(oep->module); | 143 | module_put(oep->module); |
144 | |||
154 | return 0; | 145 | return 0; |
155 | } | 146 | } |
156 | 147 | ||
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) | |||
192 | { | 183 | { |
193 | unsigned int old = current->personality; | 184 | unsigned int old = current->personality; |
194 | 185 | ||
195 | if (personality != 0xffffffff) { | 186 | if (personality != 0xffffffff) |
196 | set_personality(personality); | 187 | set_personality(personality); |
197 | if (current->personality != personality) | ||
198 | return -EINVAL; | ||
199 | } | ||
200 | 188 | ||
201 | return old; | 189 | return old; |
202 | } | 190 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index ceffc67b564a..03120229db28 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father) | |||
771 | struct task_struct *p, *n, *reaper; | 771 | struct task_struct *p, *n, *reaper; |
772 | LIST_HEAD(dead_children); | 772 | LIST_HEAD(dead_children); |
773 | 773 | ||
774 | exit_ptrace(father); | ||
775 | |||
776 | write_lock_irq(&tasklist_lock); | 774 | write_lock_irq(&tasklist_lock); |
775 | /* | ||
776 | * Note that exit_ptrace() and find_new_reaper() might | ||
777 | * drop tasklist_lock and reacquire it. | ||
778 | */ | ||
779 | exit_ptrace(father); | ||
777 | reaper = find_new_reaper(father); | 780 | reaper = find_new_reaper(father); |
778 | 781 | ||
779 | list_for_each_entry_safe(p, n, &father->children, sibling) { | 782 | list_for_each_entry_safe(p, n, &father->children, sibling) { |
@@ -1383,8 +1386,7 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
1383 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1386 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1384 | *p_code = 0; | 1387 | *p_code = 0; |
1385 | 1388 | ||
1386 | /* don't need the RCU readlock here as we're holding a spinlock */ | 1389 | uid = task_uid(p); |
1387 | uid = __task_cred(p)->uid; | ||
1388 | unlock_sig: | 1390 | unlock_sig: |
1389 | spin_unlock_irq(&p->sighand->siglock); | 1391 | spin_unlock_irq(&p->sighand->siglock); |
1390 | if (!exit_code) | 1392 | if (!exit_code) |
@@ -1457,7 +1459,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
1457 | } | 1459 | } |
1458 | if (!unlikely(wo->wo_flags & WNOWAIT)) | 1460 | if (!unlikely(wo->wo_flags & WNOWAIT)) |
1459 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; | 1461 | p->signal->flags &= ~SIGNAL_STOP_CONTINUED; |
1460 | uid = __task_cred(p)->uid; | 1462 | uid = task_uid(p); |
1461 | spin_unlock_irq(&p->sighand->siglock); | 1463 | spin_unlock_irq(&p->sighand->siglock); |
1462 | 1464 | ||
1463 | pid = task_pid_vnr(p); | 1465 | pid = task_pid_vnr(p); |
diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba047..c445f8cc408d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -300,7 +300,7 @@ out: | |||
300 | #ifdef CONFIG_MMU | 300 | #ifdef CONFIG_MMU |
301 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | 301 | static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
302 | { | 302 | { |
303 | struct vm_area_struct *mpnt, *tmp, **pprev; | 303 | struct vm_area_struct *mpnt, *tmp, *prev, **pprev; |
304 | struct rb_node **rb_link, *rb_parent; | 304 | struct rb_node **rb_link, *rb_parent; |
305 | int retval; | 305 | int retval; |
306 | unsigned long charge; | 306 | unsigned long charge; |
@@ -328,6 +328,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
328 | if (retval) | 328 | if (retval) |
329 | goto out; | 329 | goto out; |
330 | 330 | ||
331 | prev = NULL; | ||
331 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | 332 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
332 | struct file *file; | 333 | struct file *file; |
333 | 334 | ||
@@ -355,11 +356,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
355 | if (IS_ERR(pol)) | 356 | if (IS_ERR(pol)) |
356 | goto fail_nomem_policy; | 357 | goto fail_nomem_policy; |
357 | vma_set_policy(tmp, pol); | 358 | vma_set_policy(tmp, pol); |
359 | tmp->vm_mm = mm; | ||
358 | if (anon_vma_fork(tmp, mpnt)) | 360 | if (anon_vma_fork(tmp, mpnt)) |
359 | goto fail_nomem_anon_vma_fork; | 361 | goto fail_nomem_anon_vma_fork; |
360 | tmp->vm_flags &= ~VM_LOCKED; | 362 | tmp->vm_flags &= ~VM_LOCKED; |
361 | tmp->vm_mm = mm; | 363 | tmp->vm_next = tmp->vm_prev = NULL; |
362 | tmp->vm_next = NULL; | ||
363 | file = tmp->vm_file; | 364 | file = tmp->vm_file; |
364 | if (file) { | 365 | if (file) { |
365 | struct inode *inode = file->f_path.dentry->d_inode; | 366 | struct inode *inode = file->f_path.dentry->d_inode; |
@@ -392,6 +393,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
392 | */ | 393 | */ |
393 | *pprev = tmp; | 394 | *pprev = tmp; |
394 | pprev = &tmp->vm_next; | 395 | pprev = &tmp->vm_next; |
396 | tmp->vm_prev = prev; | ||
397 | prev = tmp; | ||
395 | 398 | ||
396 | __vma_link_rb(mm, tmp, rb_link, rb_parent); | 399 | __vma_link_rb(mm, tmp, rb_link, rb_parent); |
397 | rb_link = &tmp->vm_rb.rb_right; | 400 | rb_link = &tmp->vm_rb.rb_right; |
@@ -752,13 +755,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) | |||
752 | struct fs_struct *fs = current->fs; | 755 | struct fs_struct *fs = current->fs; |
753 | if (clone_flags & CLONE_FS) { | 756 | if (clone_flags & CLONE_FS) { |
754 | /* tsk->fs is already what we want */ | 757 | /* tsk->fs is already what we want */ |
755 | write_lock(&fs->lock); | 758 | spin_lock(&fs->lock); |
756 | if (fs->in_exec) { | 759 | if (fs->in_exec) { |
757 | write_unlock(&fs->lock); | 760 | spin_unlock(&fs->lock); |
758 | return -EAGAIN; | 761 | return -EAGAIN; |
759 | } | 762 | } |
760 | fs->users++; | 763 | fs->users++; |
761 | write_unlock(&fs->lock); | 764 | spin_unlock(&fs->lock); |
762 | return 0; | 765 | return 0; |
763 | } | 766 | } |
764 | tsk->fs = copy_fs_struct(fs); | 767 | tsk->fs = copy_fs_struct(fs); |
@@ -899,6 +902,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
899 | tty_audit_fork(sig); | 902 | tty_audit_fork(sig); |
900 | 903 | ||
901 | sig->oom_adj = current->signal->oom_adj; | 904 | sig->oom_adj = current->signal->oom_adj; |
905 | sig->oom_score_adj = current->signal->oom_score_adj; | ||
902 | 906 | ||
903 | return 0; | 907 | return 0; |
904 | } | 908 | } |
@@ -907,7 +911,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
907 | { | 911 | { |
908 | unsigned long new_flags = p->flags; | 912 | unsigned long new_flags = p->flags; |
909 | 913 | ||
910 | new_flags &= ~PF_SUPERPRIV; | 914 | new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); |
911 | new_flags |= PF_FORKNOEXEC; | 915 | new_flags |= PF_FORKNOEXEC; |
912 | new_flags |= PF_STARTING; | 916 | new_flags |= PF_STARTING; |
913 | p->flags = new_flags; | 917 | p->flags = new_flags; |
@@ -1675,13 +1679,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1675 | 1679 | ||
1676 | if (new_fs) { | 1680 | if (new_fs) { |
1677 | fs = current->fs; | 1681 | fs = current->fs; |
1678 | write_lock(&fs->lock); | 1682 | spin_lock(&fs->lock); |
1679 | current->fs = new_fs; | 1683 | current->fs = new_fs; |
1680 | if (--fs->users) | 1684 | if (--fs->users) |
1681 | new_fs = NULL; | 1685 | new_fs = NULL; |
1682 | else | 1686 | else |
1683 | new_fs = fs; | 1687 | new_fs = fs; |
1684 | write_unlock(&fs->lock); | 1688 | spin_unlock(&fs->lock); |
1685 | } | 1689 | } |
1686 | 1690 | ||
1687 | if (new_mm) { | 1691 | if (new_mm) { |
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f88a7a3..f83972b16564 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -33,10 +33,11 @@ | |||
33 | * @children: child nodes | 33 | * @children: child nodes |
34 | * @all: list head for list of all nodes | 34 | * @all: list head for list of all nodes |
35 | * @parent: parent node | 35 | * @parent: parent node |
36 | * @info: associated profiling data structure if not a directory | 36 | * @loaded_info: array of pointers to profiling data sets for loaded object |
37 | * @ghost: when an object file containing profiling data is unloaded we keep a | 37 | * files. |
38 | * copy of the profiling data here to allow collecting coverage data | 38 | * @num_loaded: number of profiling data sets for loaded object files. |
39 | * for cleanup code. Such a node is called a "ghost". | 39 | * @unloaded_info: accumulated copy of profiling data sets for unloaded |
40 | * object files. Used only when gcov_persist=1. | ||
40 | * @dentry: main debugfs entry, either a directory or data file | 41 | * @dentry: main debugfs entry, either a directory or data file |
41 | * @links: associated symbolic links | 42 | * @links: associated symbolic links |
42 | * @name: data file basename | 43 | * @name: data file basename |
@@ -51,10 +52,11 @@ struct gcov_node { | |||
51 | struct list_head children; | 52 | struct list_head children; |
52 | struct list_head all; | 53 | struct list_head all; |
53 | struct gcov_node *parent; | 54 | struct gcov_node *parent; |
54 | struct gcov_info *info; | 55 | struct gcov_info **loaded_info; |
55 | struct gcov_info *ghost; | 56 | struct gcov_info *unloaded_info; |
56 | struct dentry *dentry; | 57 | struct dentry *dentry; |
57 | struct dentry **links; | 58 | struct dentry **links; |
59 | int num_loaded; | ||
58 | char name[0]; | 60 | char name[0]; |
59 | }; | 61 | }; |
60 | 62 | ||
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { | |||
136 | }; | 138 | }; |
137 | 139 | ||
138 | /* | 140 | /* |
139 | * Return the profiling data set for a given node. This can either be the | 141 | * Return a profiling data set associated with the given node. This is |
140 | * original profiling data structure or a duplicate (also called "ghost") | 142 | * either a data set for a loaded object file or a data set copy in case |
141 | * in case the associated object file has been unloaded. | 143 | * all associated object files have been unloaded. |
142 | */ | 144 | */ |
143 | static struct gcov_info *get_node_info(struct gcov_node *node) | 145 | static struct gcov_info *get_node_info(struct gcov_node *node) |
144 | { | 146 | { |
145 | if (node->info) | 147 | if (node->num_loaded > 0) |
146 | return node->info; | 148 | return node->loaded_info[0]; |
147 | 149 | ||
148 | return node->ghost; | 150 | return node->unloaded_info; |
151 | } | ||
152 | |||
153 | /* | ||
154 | * Return a newly allocated profiling data set which contains the sum of | ||
155 | * all profiling data associated with the given node. | ||
156 | */ | ||
157 | static struct gcov_info *get_accumulated_info(struct gcov_node *node) | ||
158 | { | ||
159 | struct gcov_info *info; | ||
160 | int i = 0; | ||
161 | |||
162 | if (node->unloaded_info) | ||
163 | info = gcov_info_dup(node->unloaded_info); | ||
164 | else | ||
165 | info = gcov_info_dup(node->loaded_info[i++]); | ||
166 | if (!info) | ||
167 | return NULL; | ||
168 | for (; i < node->num_loaded; i++) | ||
169 | gcov_info_add(info, node->loaded_info[i]); | ||
170 | |||
171 | return info; | ||
149 | } | 172 | } |
150 | 173 | ||
151 | /* | 174 | /* |
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) | |||
163 | mutex_lock(&node_lock); | 186 | mutex_lock(&node_lock); |
164 | /* | 187 | /* |
165 | * Read from a profiling data copy to minimize reference tracking | 188 | * Read from a profiling data copy to minimize reference tracking |
166 | * complexity and concurrent access. | 189 | * complexity and concurrent access and to keep accumulating multiple |
190 | * profiling data sets associated with one node simple. | ||
167 | */ | 191 | */ |
168 | info = gcov_info_dup(get_node_info(node)); | 192 | info = get_accumulated_info(node); |
169 | if (!info) | 193 | if (!info) |
170 | goto out_unlock; | 194 | goto out_unlock; |
171 | iter = gcov_iter_new(info); | 195 | iter = gcov_iter_new(info); |
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) | |||
225 | return NULL; | 249 | return NULL; |
226 | } | 250 | } |
227 | 251 | ||
252 | /* | ||
253 | * Reset all profiling data associated with the specified node. | ||
254 | */ | ||
255 | static void reset_node(struct gcov_node *node) | ||
256 | { | ||
257 | int i; | ||
258 | |||
259 | if (node->unloaded_info) | ||
260 | gcov_info_reset(node->unloaded_info); | ||
261 | for (i = 0; i < node->num_loaded; i++) | ||
262 | gcov_info_reset(node->loaded_info[i]); | ||
263 | } | ||
264 | |||
228 | static void remove_node(struct gcov_node *node); | 265 | static void remove_node(struct gcov_node *node); |
229 | 266 | ||
230 | /* | 267 | /* |
231 | * write() implementation for gcov data files. Reset profiling data for the | 268 | * write() implementation for gcov data files. Reset profiling data for the |
232 | * associated file. If the object file has been unloaded (i.e. this is | 269 | * corresponding file. If all associated object files have been unloaded, |
233 | * a "ghost" node), remove the debug fs node as well. | 270 | * remove the debug fs node as well. |
234 | */ | 271 | */ |
235 | static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | 272 | static ssize_t gcov_seq_write(struct file *file, const char __user *addr, |
236 | size_t len, loff_t *pos) | 273 | size_t len, loff_t *pos) |
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, | |||
245 | node = get_node_by_name(info->filename); | 282 | node = get_node_by_name(info->filename); |
246 | if (node) { | 283 | if (node) { |
247 | /* Reset counts or remove node for unloaded modules. */ | 284 | /* Reset counts or remove node for unloaded modules. */ |
248 | if (node->ghost) | 285 | if (node->num_loaded == 0) |
249 | remove_node(node); | 286 | remove_node(node); |
250 | else | 287 | else |
251 | gcov_info_reset(node->info); | 288 | reset_node(node); |
252 | } | 289 | } |
253 | /* Reset counts for open file. */ | 290 | /* Reset counts for open file. */ |
254 | gcov_info_reset(info); | 291 | gcov_info_reset(info); |
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, | |||
378 | INIT_LIST_HEAD(&node->list); | 415 | INIT_LIST_HEAD(&node->list); |
379 | INIT_LIST_HEAD(&node->children); | 416 | INIT_LIST_HEAD(&node->children); |
380 | INIT_LIST_HEAD(&node->all); | 417 | INIT_LIST_HEAD(&node->all); |
381 | node->info = info; | 418 | if (node->loaded_info) { |
419 | node->loaded_info[0] = info; | ||
420 | node->num_loaded = 1; | ||
421 | } | ||
382 | node->parent = parent; | 422 | node->parent = parent; |
383 | if (name) | 423 | if (name) |
384 | strcpy(node->name, name); | 424 | strcpy(node->name, name); |
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
394 | struct gcov_node *node; | 434 | struct gcov_node *node; |
395 | 435 | ||
396 | node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); | 436 | node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); |
397 | if (!node) { | 437 | if (!node) |
398 | pr_warning("out of memory\n"); | 438 | goto err_nomem; |
399 | return NULL; | 439 | if (info) { |
440 | node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), | ||
441 | GFP_KERNEL); | ||
442 | if (!node->loaded_info) | ||
443 | goto err_nomem; | ||
400 | } | 444 | } |
401 | init_node(node, info, name, parent); | 445 | init_node(node, info, name, parent); |
402 | /* Differentiate between gcov data file nodes and directory nodes. */ | 446 | /* Differentiate between gcov data file nodes and directory nodes. */ |
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
416 | list_add(&node->all, &all_head); | 460 | list_add(&node->all, &all_head); |
417 | 461 | ||
418 | return node; | 462 | return node; |
463 | |||
464 | err_nomem: | ||
465 | kfree(node); | ||
466 | pr_warning("out of memory\n"); | ||
467 | return NULL; | ||
419 | } | 468 | } |
420 | 469 | ||
421 | /* Remove symbolic links associated with node. */ | 470 | /* Remove symbolic links associated with node. */ |
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) | |||
441 | list_del(&node->all); | 490 | list_del(&node->all); |
442 | debugfs_remove(node->dentry); | 491 | debugfs_remove(node->dentry); |
443 | remove_links(node); | 492 | remove_links(node); |
444 | if (node->ghost) | 493 | kfree(node->loaded_info); |
445 | gcov_info_free(node->ghost); | 494 | if (node->unloaded_info) |
495 | gcov_info_free(node->unloaded_info); | ||
446 | kfree(node); | 496 | kfree(node); |
447 | } | 497 | } |
448 | 498 | ||
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, | |||
477 | 527 | ||
478 | /* | 528 | /* |
479 | * write() implementation for reset file. Reset all profiling data to zero | 529 | * write() implementation for reset file. Reset all profiling data to zero |
480 | * and remove ghost nodes. | 530 | * and remove nodes for which all associated object files are unloaded. |
481 | */ | 531 | */ |
482 | static ssize_t reset_write(struct file *file, const char __user *addr, | 532 | static ssize_t reset_write(struct file *file, const char __user *addr, |
483 | size_t len, loff_t *pos) | 533 | size_t len, loff_t *pos) |
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, | |||
487 | mutex_lock(&node_lock); | 537 | mutex_lock(&node_lock); |
488 | restart: | 538 | restart: |
489 | list_for_each_entry(node, &all_head, all) { | 539 | list_for_each_entry(node, &all_head, all) { |
490 | if (node->info) | 540 | if (node->num_loaded > 0) |
491 | gcov_info_reset(node->info); | 541 | reset_node(node); |
492 | else if (list_empty(&node->children)) { | 542 | else if (list_empty(&node->children)) { |
493 | remove_node(node); | 543 | remove_node(node); |
494 | /* Several nodes may have gone - restart loop. */ | 544 | /* Several nodes may have gone - restart loop. */ |
@@ -564,37 +614,115 @@ err_remove: | |||
564 | } | 614 | } |
565 | 615 | ||
566 | /* | 616 | /* |
567 | * The profiling data set associated with this node is being unloaded. Store a | 617 | * Associate a profiling data set with an existing node. Needs to be called |
568 | * copy of the profiling data and turn this node into a "ghost". | 618 | * with node_lock held. |
569 | */ | 619 | */ |
570 | static int ghost_node(struct gcov_node *node) | 620 | static void add_info(struct gcov_node *node, struct gcov_info *info) |
571 | { | 621 | { |
572 | node->ghost = gcov_info_dup(node->info); | 622 | struct gcov_info **loaded_info; |
573 | if (!node->ghost) { | 623 | int num = node->num_loaded; |
574 | pr_warning("could not save data for '%s' (out of memory)\n", | 624 | |
575 | node->info->filename); | 625 | /* |
576 | return -ENOMEM; | 626 | * Prepare new array. This is done first to simplify cleanup in |
627 | * case the new data set is incompatible, the node only contains | ||
628 | * unloaded data sets and there's not enough memory for the array. | ||
629 | */ | ||
630 | loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); | ||
631 | if (!loaded_info) { | ||
632 | pr_warning("could not add '%s' (out of memory)\n", | ||
633 | info->filename); | ||
634 | return; | ||
635 | } | ||
636 | memcpy(loaded_info, node->loaded_info, | ||
637 | num * sizeof(struct gcov_info *)); | ||
638 | loaded_info[num] = info; | ||
639 | /* Check if the new data set is compatible. */ | ||
640 | if (num == 0) { | ||
641 | /* | ||
642 | * A module was unloaded, modified and reloaded. The new | ||
643 | * data set replaces the copy of the last one. | ||
644 | */ | ||
645 | if (!gcov_info_is_compatible(node->unloaded_info, info)) { | ||
646 | pr_warning("discarding saved data for %s " | ||
647 | "(incompatible version)\n", info->filename); | ||
648 | gcov_info_free(node->unloaded_info); | ||
649 | node->unloaded_info = NULL; | ||
650 | } | ||
651 | } else { | ||
652 | /* | ||
653 | * Two different versions of the same object file are loaded. | ||
654 | * The initial one takes precedence. | ||
655 | */ | ||
656 | if (!gcov_info_is_compatible(node->loaded_info[0], info)) { | ||
657 | pr_warning("could not add '%s' (incompatible " | ||
658 | "version)\n", info->filename); | ||
659 | kfree(loaded_info); | ||
660 | return; | ||
661 | } | ||
577 | } | 662 | } |
578 | node->info = NULL; | 663 | /* Overwrite previous array. */ |
664 | kfree(node->loaded_info); | ||
665 | node->loaded_info = loaded_info; | ||
666 | node->num_loaded = num + 1; | ||
667 | } | ||
579 | 668 | ||
580 | return 0; | 669 | /* |
670 | * Return the index of a profiling data set associated with a node. | ||
671 | */ | ||
672 | static int get_info_index(struct gcov_node *node, struct gcov_info *info) | ||
673 | { | ||
674 | int i; | ||
675 | |||
676 | for (i = 0; i < node->num_loaded; i++) { | ||
677 | if (node->loaded_info[i] == info) | ||
678 | return i; | ||
679 | } | ||
680 | return -ENOENT; | ||
581 | } | 681 | } |
582 | 682 | ||
583 | /* | 683 | /* |
584 | * Profiling data for this node has been loaded again. Add profiling data | 684 | * Save the data of a profiling data set which is being unloaded. |
585 | * from previous instantiation and turn this node into a regular node. | ||
586 | */ | 685 | */ |
587 | static void revive_node(struct gcov_node *node, struct gcov_info *info) | 686 | static void save_info(struct gcov_node *node, struct gcov_info *info) |
588 | { | 687 | { |
589 | if (gcov_info_is_compatible(node->ghost, info)) | 688 | if (node->unloaded_info) |
590 | gcov_info_add(info, node->ghost); | 689 | gcov_info_add(node->unloaded_info, info); |
591 | else { | 690 | else { |
592 | pr_warning("discarding saved data for '%s' (version changed)\n", | 691 | node->unloaded_info = gcov_info_dup(info); |
692 | if (!node->unloaded_info) { | ||
693 | pr_warning("could not save data for '%s' " | ||
694 | "(out of memory)\n", info->filename); | ||
695 | } | ||
696 | } | ||
697 | } | ||
698 | |||
699 | /* | ||
700 | * Disassociate a profiling data set from a node. Needs to be called with | ||
701 | * node_lock held. | ||
702 | */ | ||
703 | static void remove_info(struct gcov_node *node, struct gcov_info *info) | ||
704 | { | ||
705 | int i; | ||
706 | |||
707 | i = get_info_index(node, info); | ||
708 | if (i < 0) { | ||
709 | pr_warning("could not remove '%s' (not found)\n", | ||
593 | info->filename); | 710 | info->filename); |
711 | return; | ||
594 | } | 712 | } |
595 | gcov_info_free(node->ghost); | 713 | if (gcov_persist) |
596 | node->ghost = NULL; | 714 | save_info(node, info); |
597 | node->info = info; | 715 | /* Shrink array. */ |
716 | node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; | ||
717 | node->num_loaded--; | ||
718 | if (node->num_loaded > 0) | ||
719 | return; | ||
720 | /* Last loaded data set was removed. */ | ||
721 | kfree(node->loaded_info); | ||
722 | node->loaded_info = NULL; | ||
723 | node->num_loaded = 0; | ||
724 | if (!node->unloaded_info) | ||
725 | remove_node(node); | ||
598 | } | 726 | } |
599 | 727 | ||
600 | /* | 728 | /* |
@@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) | |||
609 | node = get_node_by_name(info->filename); | 737 | node = get_node_by_name(info->filename); |
610 | switch (action) { | 738 | switch (action) { |
611 | case GCOV_ADD: | 739 | case GCOV_ADD: |
612 | /* Add new node or revive ghost. */ | 740 | if (node) |
613 | if (!node) { | 741 | add_info(node, info); |
742 | else | ||
614 | add_node(info); | 743 | add_node(info); |
615 | break; | ||
616 | } | ||
617 | if (gcov_persist) | ||
618 | revive_node(node, info); | ||
619 | else { | ||
620 | pr_warning("could not add '%s' (already exists)\n", | ||
621 | info->filename); | ||
622 | } | ||
623 | break; | 744 | break; |
624 | case GCOV_REMOVE: | 745 | case GCOV_REMOVE: |
625 | /* Remove node or turn into ghost. */ | 746 | if (node) |
626 | if (!node) { | 747 | remove_info(node, info); |
748 | else { | ||
627 | pr_warning("could not remove '%s' (not found)\n", | 749 | pr_warning("could not remove '%s' (not found)\n", |
628 | info->filename); | 750 | info->filename); |
629 | break; | ||
630 | } | 751 | } |
631 | if (gcov_persist) { | ||
632 | if (!ghost_node(node)) | ||
633 | break; | ||
634 | } | ||
635 | remove_node(node); | ||
636 | break; | 752 | break; |
637 | } | 753 | } |
638 | mutex_unlock(&node_lock); | 754 | mutex_unlock(&node_lock); |
diff --git a/kernel/groups.c b/kernel/groups.c index 53b1916c9492..253dc0f35cf4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) | |||
143 | right = group_info->ngroups; | 143 | right = group_info->ngroups; |
144 | while (left < right) { | 144 | while (left < right) { |
145 | unsigned int mid = (left+right)/2; | 145 | unsigned int mid = (left+right)/2; |
146 | int cmp = grp - GROUP_AT(group_info, mid); | 146 | if (grp > GROUP_AT(group_info, mid)) |
147 | if (cmp > 0) | ||
148 | left = mid + 1; | 147 | left = mid + 1; |
149 | else if (cmp < 0) | 148 | else if (grp < GROUP_AT(group_info, mid)) |
150 | right = mid; | 149 | right = mid; |
151 | else | 150 | else |
152 | return 1; | 151 | return 1; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..1decafbb6b1a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
90 | do { | 90 | do { |
91 | seq = read_seqbegin(&xtime_lock); | 91 | seq = read_seqbegin(&xtime_lock); |
92 | xts = __current_kernel_time(); | 92 | xts = __current_kernel_time(); |
93 | tom = wall_to_monotonic; | 93 | tom = __get_wall_to_monotonic(); |
94 | } while (read_seqretry(&xtime_lock, seq)); | 94 | } while (read_seqretry(&xtime_lock, seq)); |
95 | 95 | ||
96 | xtim = timespec_to_ktime(xts); | 96 | xtim = timespec_to_ktime(xts); |
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, | |||
144 | static int hrtimer_get_target(int this_cpu, int pinned) | 144 | static int hrtimer_get_target(int this_cpu, int pinned) |
145 | { | 145 | { |
146 | #ifdef CONFIG_NO_HZ | 146 | #ifdef CONFIG_NO_HZ |
147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { | 147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) |
148 | int preferred_cpu = get_nohz_load_balancer(); | 148 | return get_nohz_timer_target(); |
149 | |||
150 | if (preferred_cpu >= 0) | ||
151 | return preferred_cpu; | ||
152 | } | ||
153 | #endif | 149 | #endif |
154 | return this_cpu; | 150 | return this_cpu; |
155 | } | 151 | } |
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
612 | static void retrigger_next_event(void *arg) | 608 | static void retrigger_next_event(void *arg) |
613 | { | 609 | { |
614 | struct hrtimer_cpu_base *base; | 610 | struct hrtimer_cpu_base *base; |
615 | struct timespec realtime_offset; | 611 | struct timespec realtime_offset, wtm; |
616 | unsigned long seq; | 612 | unsigned long seq; |
617 | 613 | ||
618 | if (!hrtimer_hres_active()) | 614 | if (!hrtimer_hres_active()) |
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) | |||
620 | 616 | ||
621 | do { | 617 | do { |
622 | seq = read_seqbegin(&xtime_lock); | 618 | seq = read_seqbegin(&xtime_lock); |
623 | set_normalized_timespec(&realtime_offset, | 619 | wtm = __get_wall_to_monotonic(); |
624 | -wall_to_monotonic.tv_sec, | ||
625 | -wall_to_monotonic.tv_nsec); | ||
626 | } while (read_seqretry(&xtime_lock, seq)); | 620 | } while (read_seqretry(&xtime_lock, seq)); |
621 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
627 | 622 | ||
628 | base = &__get_cpu_var(hrtimer_bases); | 623 | base = &__get_cpu_var(hrtimer_bases); |
629 | 624 | ||
@@ -1096,11 +1091,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
1096 | */ | 1091 | */ |
1097 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 1092 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
1098 | { | 1093 | { |
1099 | struct hrtimer_clock_base *base; | ||
1100 | unsigned long flags; | 1094 | unsigned long flags; |
1101 | ktime_t rem; | 1095 | ktime_t rem; |
1102 | 1096 | ||
1103 | base = lock_hrtimer_base(timer, &flags); | 1097 | lock_hrtimer_base(timer, &flags); |
1104 | rem = hrtimer_expires_remaining(timer); | 1098 | rem = hrtimer_expires_remaining(timer); |
1105 | unlock_hrtimer_base(timer, &flags); | 1099 | unlock_hrtimer_base(timer, &flags); |
1106 | 1100 | ||
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22e0602..c7c2aed9e2dc 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/list.h> | ||
44 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
45 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
46 | 47 | ||
@@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); | |||
62 | 63 | ||
63 | static int nr_slots[TYPE_MAX]; | 64 | static int nr_slots[TYPE_MAX]; |
64 | 65 | ||
66 | /* Keep track of the breakpoints attached to tasks */ | ||
67 | static LIST_HEAD(bp_task_head); | ||
68 | |||
65 | static int constraints_initialized; | 69 | static int constraints_initialized; |
66 | 70 | ||
67 | /* Gather the number of total pinned and un-pinned bp in a cpuset */ | 71 | /* Gather the number of total pinned and un-pinned bp in a cpuset */ |
@@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
103 | return 0; | 107 | return 0; |
104 | } | 108 | } |
105 | 109 | ||
106 | static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) | 110 | /* |
111 | * Count the number of breakpoints of the same type and same task. | ||
112 | * The given event must be not on the list. | ||
113 | */ | ||
114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | ||
107 | { | 115 | { |
108 | struct perf_event_context *ctx = tsk->perf_event_ctxp; | 116 | struct perf_event_context *ctx = bp->ctx; |
109 | struct list_head *list; | 117 | struct perf_event *iter; |
110 | struct perf_event *bp; | ||
111 | unsigned long flags; | ||
112 | int count = 0; | 118 | int count = 0; |
113 | 119 | ||
114 | if (WARN_ONCE(!ctx, "No perf context for this task")) | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
115 | return 0; | 121 | if (iter->ctx == ctx && find_slot_idx(iter) == type) |
116 | 122 | count += hw_breakpoint_weight(iter); | |
117 | list = &ctx->event_list; | ||
118 | |||
119 | raw_spin_lock_irqsave(&ctx->lock, flags); | ||
120 | |||
121 | /* | ||
122 | * The current breakpoint counter is not included in the list | ||
123 | * at the open() callback time | ||
124 | */ | ||
125 | list_for_each_entry(bp, list, event_entry) { | ||
126 | if (bp->attr.type == PERF_TYPE_BREAKPOINT) | ||
127 | if (find_slot_idx(bp) == type) | ||
128 | count += hw_breakpoint_weight(bp); | ||
129 | } | 123 | } |
130 | 124 | ||
131 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | ||
132 | |||
133 | return count; | 125 | return count; |
134 | } | 126 | } |
135 | 127 | ||
@@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
149 | if (!tsk) | 141 | if (!tsk) |
150 | slots->pinned += max_task_bp_pinned(cpu, type); | 142 | slots->pinned += max_task_bp_pinned(cpu, type); |
151 | else | 143 | else |
152 | slots->pinned += task_bp_pinned(tsk, type); | 144 | slots->pinned += task_bp_pinned(bp, type); |
153 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); | 145 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); |
154 | 146 | ||
155 | return; | 147 | return; |
@@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
162 | if (!tsk) | 154 | if (!tsk) |
163 | nr += max_task_bp_pinned(cpu, type); | 155 | nr += max_task_bp_pinned(cpu, type); |
164 | else | 156 | else |
165 | nr += task_bp_pinned(tsk, type); | 157 | nr += task_bp_pinned(bp, type); |
166 | 158 | ||
167 | if (nr > slots->pinned) | 159 | if (nr > slots->pinned) |
168 | slots->pinned = nr; | 160 | slots->pinned = nr; |
@@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) | |||
188 | /* | 180 | /* |
189 | * Add a pinned breakpoint for the given task in our constraint table | 181 | * Add a pinned breakpoint for the given task in our constraint table |
190 | */ | 182 | */ |
191 | static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, | 183 | static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, |
192 | enum bp_type_idx type, int weight) | 184 | enum bp_type_idx type, int weight) |
193 | { | 185 | { |
194 | unsigned int *tsk_pinned; | 186 | unsigned int *tsk_pinned; |
@@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, | |||
196 | int old_idx = 0; | 188 | int old_idx = 0; |
197 | int idx = 0; | 189 | int idx = 0; |
198 | 190 | ||
199 | old_count = task_bp_pinned(tsk, type); | 191 | old_count = task_bp_pinned(bp, type); |
200 | old_idx = old_count - 1; | 192 | old_idx = old_count - 1; |
201 | idx = old_idx + weight; | 193 | idx = old_idx + weight; |
202 | 194 | ||
195 | /* tsk_pinned[n] is the number of tasks having n breakpoints */ | ||
203 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); | 196 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); |
204 | if (enable) { | 197 | if (enable) { |
205 | tsk_pinned[idx]++; | 198 | tsk_pinned[idx]++; |
@@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
222 | int cpu = bp->cpu; | 215 | int cpu = bp->cpu; |
223 | struct task_struct *tsk = bp->ctx->task; | 216 | struct task_struct *tsk = bp->ctx->task; |
224 | 217 | ||
218 | /* Pinned counter cpu profiling */ | ||
219 | if (!tsk) { | ||
220 | |||
221 | if (enable) | ||
222 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | ||
223 | else | ||
224 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | ||
225 | return; | ||
226 | } | ||
227 | |||
225 | /* Pinned counter task profiling */ | 228 | /* Pinned counter task profiling */ |
226 | if (tsk) { | ||
227 | if (cpu >= 0) { | ||
228 | toggle_bp_task_slot(tsk, cpu, enable, type, weight); | ||
229 | return; | ||
230 | } | ||
231 | 229 | ||
230 | if (!enable) | ||
231 | list_del(&bp->hw.bp_list); | ||
232 | |||
233 | if (cpu >= 0) { | ||
234 | toggle_bp_task_slot(bp, cpu, enable, type, weight); | ||
235 | } else { | ||
232 | for_each_online_cpu(cpu) | 236 | for_each_online_cpu(cpu) |
233 | toggle_bp_task_slot(tsk, cpu, enable, type, weight); | 237 | toggle_bp_task_slot(bp, cpu, enable, type, weight); |
234 | return; | ||
235 | } | 238 | } |
236 | 239 | ||
237 | /* Pinned counter cpu profiling */ | ||
238 | if (enable) | 240 | if (enable) |
239 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | 241 | list_add_tail(&bp->hw.bp_list, &bp_task_head); |
240 | else | 242 | } |
241 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | 243 | |
244 | /* | ||
245 | * Function to perform processor-specific cleanup during unregistration | ||
246 | */ | ||
247 | __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) | ||
248 | { | ||
249 | /* | ||
250 | * A weak stub function here for those archs that don't define | ||
251 | * it inside arch/.../kernel/hw_breakpoint.c | ||
252 | */ | ||
242 | } | 253 | } |
243 | 254 | ||
244 | /* | 255 | /* |
@@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp) | |||
301 | weight = hw_breakpoint_weight(bp); | 312 | weight = hw_breakpoint_weight(bp); |
302 | 313 | ||
303 | fetch_bp_busy_slots(&slots, bp, type); | 314 | fetch_bp_busy_slots(&slots, bp, type); |
315 | /* | ||
316 | * Simulate the addition of this breakpoint to the constraints | ||
317 | * and see the result. | ||
318 | */ | ||
304 | fetch_this_slot(&slots, weight); | 319 | fetch_this_slot(&slots, weight); |
305 | 320 | ||
306 | /* Flexible counters need to keep at least one slot */ | 321 | /* Flexible counters need to keep at least one slot */ |
@@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp) | |||
339 | { | 354 | { |
340 | mutex_lock(&nr_bp_mutex); | 355 | mutex_lock(&nr_bp_mutex); |
341 | 356 | ||
357 | arch_unregister_hw_breakpoint(bp); | ||
342 | __release_bp_slot(bp); | 358 | __release_bp_slot(bp); |
343 | 359 | ||
344 | mutex_unlock(&nr_bp_mutex); | 360 | mutex_unlock(&nr_bp_mutex); |
@@ -417,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, | |||
417 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
418 | struct task_struct *tsk) | 434 | struct task_struct *tsk) |
419 | { | 435 | { |
420 | return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); | 436 | return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), |
437 | triggered); | ||
421 | } | 438 | } |
422 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 439 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
423 | 440 | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8a..c3003e9d91a3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
216 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | 216 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) |
217 | { | 217 | { |
218 | if (suspend) { | 218 | if (suspend) { |
219 | if (!desc->action || (desc->action->flags & IRQF_TIMER)) | 219 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) |
220 | return; | 220 | return; |
221 | desc->status |= IRQ_SUSPENDED; | 221 | desc->status |= IRQ_SUSPENDED; |
222 | } | 222 | } |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 131b1703936f..c0613f7d6730 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
151 | image->nr_segments = nr_segments; | 151 | image->nr_segments = nr_segments; |
152 | segment_bytes = nr_segments * sizeof(*segments); | 152 | segment_bytes = nr_segments * sizeof(*segments); |
153 | result = copy_from_user(image->segment, segments, segment_bytes); | 153 | result = copy_from_user(image->segment, segments, segment_bytes); |
154 | if (result) | 154 | if (result) { |
155 | result = -EFAULT; | ||
155 | goto out; | 156 | goto out; |
157 | } | ||
156 | 158 | ||
157 | /* | 159 | /* |
158 | * Verify we have good destination addresses. The caller is | 160 | * Verify we have good destination addresses. The caller is |
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image, | |||
827 | result = copy_from_user(ptr, buf, uchunk); | 829 | result = copy_from_user(ptr, buf, uchunk); |
828 | kunmap(page); | 830 | kunmap(page); |
829 | if (result) { | 831 | if (result) { |
830 | result = (result < 0) ? result : -EIO; | 832 | result = -EFAULT; |
831 | goto out; | 833 | goto out; |
832 | } | 834 | } |
833 | ubytes -= uchunk; | 835 | ubytes -= uchunk; |
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
882 | kexec_flush_icache_page(page); | 884 | kexec_flush_icache_page(page); |
883 | kunmap(page); | 885 | kunmap(page); |
884 | if (result) { | 886 | if (result) { |
885 | result = (result < 0) ? result : -EIO; | 887 | result = -EFAULT; |
886 | goto out; | 888 | goto out; |
887 | } | 889 | } |
888 | ubytes -= uchunk; | 890 | ubytes -= uchunk; |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 35edbe22e9a9..01a0700e873f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -1,8 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * A generic kernel FIFO implementation. | 2 | * A generic kernel FIFO implementation |
3 | * | 3 | * |
4 | * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> | 4 | * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> |
5 | * Copyright (C) 2004 Stelian Pop <stelian@popies.net> | ||
6 | * | 5 | * |
7 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
8 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -11,7 +10,7 @@ | |||
11 | * | 10 | * |
12 | * This program is distributed in the hope that it will be useful, | 11 | * This program is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
16 | * | 15 | * |
17 | * You should have received a copy of the GNU General Public License | 16 | * You should have received a copy of the GNU General Public License |
@@ -24,422 +23,586 @@ | |||
24 | #include <linux/module.h> | 23 | #include <linux/module.h> |
25 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
26 | #include <linux/err.h> | 25 | #include <linux/err.h> |
27 | #include <linux/kfifo.h> | ||
28 | #include <linux/log2.h> | 26 | #include <linux/log2.h> |
29 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
28 | #include <linux/kfifo.h> | ||
30 | 29 | ||
31 | static void _kfifo_init(struct kfifo *fifo, void *buffer, | 30 | /* |
32 | unsigned int size) | 31 | * internal helper to calculate the unused elements in a fifo |
33 | { | ||
34 | fifo->buffer = buffer; | ||
35 | fifo->size = size; | ||
36 | |||
37 | kfifo_reset(fifo); | ||
38 | } | ||
39 | |||
40 | /** | ||
41 | * kfifo_init - initialize a FIFO using a preallocated buffer | ||
42 | * @fifo: the fifo to assign the buffer | ||
43 | * @buffer: the preallocated buffer to be used. | ||
44 | * @size: the size of the internal buffer, this has to be a power of 2. | ||
45 | * | ||
46 | */ | 32 | */ |
47 | void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) | 33 | static inline unsigned int kfifo_unused(struct __kfifo *fifo) |
48 | { | 34 | { |
49 | /* size must be a power of 2 */ | 35 | return (fifo->mask + 1) - (fifo->in - fifo->out); |
50 | BUG_ON(!is_power_of_2(size)); | ||
51 | |||
52 | _kfifo_init(fifo, buffer, size); | ||
53 | } | 36 | } |
54 | EXPORT_SYMBOL(kfifo_init); | ||
55 | 37 | ||
56 | /** | 38 | int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, |
57 | * kfifo_alloc - allocates a new FIFO internal buffer | 39 | size_t esize, gfp_t gfp_mask) |
58 | * @fifo: the fifo to assign then new buffer | ||
59 | * @size: the size of the buffer to be allocated, this have to be a power of 2. | ||
60 | * @gfp_mask: get_free_pages mask, passed to kmalloc() | ||
61 | * | ||
62 | * This function dynamically allocates a new fifo internal buffer | ||
63 | * | ||
64 | * The size will be rounded-up to a power of 2. | ||
65 | * The buffer will be release with kfifo_free(). | ||
66 | * Return 0 if no error, otherwise the an error code | ||
67 | */ | ||
68 | int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) | ||
69 | { | 40 | { |
70 | unsigned char *buffer; | ||
71 | |||
72 | /* | 41 | /* |
73 | * round up to the next power of 2, since our 'let the indices | 42 | * round down to the previous power of 2, since our 'let the indices |
74 | * wrap' technique works only in this case. | 43 | * wrap' technique works only in this case. |
75 | */ | 44 | */ |
76 | if (!is_power_of_2(size)) { | 45 | if (!is_power_of_2(size)) |
77 | BUG_ON(size > 0x80000000); | 46 | size = rounddown_pow_of_two(size); |
78 | size = roundup_pow_of_two(size); | 47 | |
48 | fifo->in = 0; | ||
49 | fifo->out = 0; | ||
50 | fifo->esize = esize; | ||
51 | |||
52 | if (size < 2) { | ||
53 | fifo->data = NULL; | ||
54 | fifo->mask = 0; | ||
55 | return -EINVAL; | ||
79 | } | 56 | } |
80 | 57 | ||
81 | buffer = kmalloc(size, gfp_mask); | 58 | fifo->data = kmalloc(size * esize, gfp_mask); |
82 | if (!buffer) { | 59 | |
83 | _kfifo_init(fifo, NULL, 0); | 60 | if (!fifo->data) { |
61 | fifo->mask = 0; | ||
84 | return -ENOMEM; | 62 | return -ENOMEM; |
85 | } | 63 | } |
86 | 64 | fifo->mask = size - 1; | |
87 | _kfifo_init(fifo, buffer, size); | ||
88 | 65 | ||
89 | return 0; | 66 | return 0; |
90 | } | 67 | } |
91 | EXPORT_SYMBOL(kfifo_alloc); | 68 | EXPORT_SYMBOL(__kfifo_alloc); |
92 | 69 | ||
93 | /** | 70 | void __kfifo_free(struct __kfifo *fifo) |
94 | * kfifo_free - frees the FIFO internal buffer | ||
95 | * @fifo: the fifo to be freed. | ||
96 | */ | ||
97 | void kfifo_free(struct kfifo *fifo) | ||
98 | { | 71 | { |
99 | kfree(fifo->buffer); | 72 | kfree(fifo->data); |
100 | _kfifo_init(fifo, NULL, 0); | 73 | fifo->in = 0; |
74 | fifo->out = 0; | ||
75 | fifo->esize = 0; | ||
76 | fifo->data = NULL; | ||
77 | fifo->mask = 0; | ||
101 | } | 78 | } |
102 | EXPORT_SYMBOL(kfifo_free); | 79 | EXPORT_SYMBOL(__kfifo_free); |
103 | 80 | ||
104 | /** | 81 | int __kfifo_init(struct __kfifo *fifo, void *buffer, |
105 | * kfifo_skip - skip output data | 82 | unsigned int size, size_t esize) |
106 | * @fifo: the fifo to be used. | ||
107 | * @len: number of bytes to skip | ||
108 | */ | ||
109 | void kfifo_skip(struct kfifo *fifo, unsigned int len) | ||
110 | { | 83 | { |
111 | if (len < kfifo_len(fifo)) { | 84 | size /= esize; |
112 | __kfifo_add_out(fifo, len); | 85 | |
113 | return; | 86 | if (!is_power_of_2(size)) |
87 | size = rounddown_pow_of_two(size); | ||
88 | |||
89 | fifo->in = 0; | ||
90 | fifo->out = 0; | ||
91 | fifo->esize = esize; | ||
92 | fifo->data = buffer; | ||
93 | |||
94 | if (size < 2) { | ||
95 | fifo->mask = 0; | ||
96 | return -EINVAL; | ||
114 | } | 97 | } |
115 | kfifo_reset_out(fifo); | 98 | fifo->mask = size - 1; |
99 | |||
100 | return 0; | ||
116 | } | 101 | } |
117 | EXPORT_SYMBOL(kfifo_skip); | 102 | EXPORT_SYMBOL(__kfifo_init); |
118 | 103 | ||
119 | static inline void __kfifo_in_data(struct kfifo *fifo, | 104 | static void kfifo_copy_in(struct __kfifo *fifo, const void *src, |
120 | const void *from, unsigned int len, unsigned int off) | 105 | unsigned int len, unsigned int off) |
121 | { | 106 | { |
107 | unsigned int size = fifo->mask + 1; | ||
108 | unsigned int esize = fifo->esize; | ||
122 | unsigned int l; | 109 | unsigned int l; |
123 | 110 | ||
111 | off &= fifo->mask; | ||
112 | if (esize != 1) { | ||
113 | off *= esize; | ||
114 | size *= esize; | ||
115 | len *= esize; | ||
116 | } | ||
117 | l = min(len, size - off); | ||
118 | |||
119 | memcpy(fifo->data + off, src, l); | ||
120 | memcpy(fifo->data, src + l, len - l); | ||
124 | /* | 121 | /* |
125 | * Ensure that we sample the fifo->out index -before- we | 122 | * make sure that the data in the fifo is up to date before |
126 | * start putting bytes into the kfifo. | 123 | * incrementing the fifo->in index counter |
127 | */ | 124 | */ |
125 | smp_wmb(); | ||
126 | } | ||
128 | 127 | ||
129 | smp_mb(); | 128 | unsigned int __kfifo_in(struct __kfifo *fifo, |
130 | 129 | const void *buf, unsigned int len) | |
131 | off = __kfifo_off(fifo, fifo->in + off); | 130 | { |
131 | unsigned int l; | ||
132 | 132 | ||
133 | /* first put the data starting from fifo->in to buffer end */ | 133 | l = kfifo_unused(fifo); |
134 | l = min(len, fifo->size - off); | 134 | if (len > l) |
135 | memcpy(fifo->buffer + off, from, l); | 135 | len = l; |
136 | 136 | ||
137 | /* then put the rest (if any) at the beginning of the buffer */ | 137 | kfifo_copy_in(fifo, buf, len, fifo->in); |
138 | memcpy(fifo->buffer, from + l, len - l); | 138 | fifo->in += len; |
139 | return len; | ||
139 | } | 140 | } |
141 | EXPORT_SYMBOL(__kfifo_in); | ||
140 | 142 | ||
141 | static inline void __kfifo_out_data(struct kfifo *fifo, | 143 | static void kfifo_copy_out(struct __kfifo *fifo, void *dst, |
142 | void *to, unsigned int len, unsigned int off) | 144 | unsigned int len, unsigned int off) |
143 | { | 145 | { |
146 | unsigned int size = fifo->mask + 1; | ||
147 | unsigned int esize = fifo->esize; | ||
144 | unsigned int l; | 148 | unsigned int l; |
145 | 149 | ||
150 | off &= fifo->mask; | ||
151 | if (esize != 1) { | ||
152 | off *= esize; | ||
153 | size *= esize; | ||
154 | len *= esize; | ||
155 | } | ||
156 | l = min(len, size - off); | ||
157 | |||
158 | memcpy(dst, fifo->data + off, l); | ||
159 | memcpy(dst + l, fifo->data, len - l); | ||
146 | /* | 160 | /* |
147 | * Ensure that we sample the fifo->in index -before- we | 161 | * make sure that the data is copied before |
148 | * start removing bytes from the kfifo. | 162 | * incrementing the fifo->out index counter |
149 | */ | 163 | */ |
164 | smp_wmb(); | ||
165 | } | ||
150 | 166 | ||
151 | smp_rmb(); | 167 | unsigned int __kfifo_out_peek(struct __kfifo *fifo, |
168 | void *buf, unsigned int len) | ||
169 | { | ||
170 | unsigned int l; | ||
152 | 171 | ||
153 | off = __kfifo_off(fifo, fifo->out + off); | 172 | l = fifo->in - fifo->out; |
173 | if (len > l) | ||
174 | len = l; | ||
154 | 175 | ||
155 | /* first get the data from fifo->out until the end of the buffer */ | 176 | kfifo_copy_out(fifo, buf, len, fifo->out); |
156 | l = min(len, fifo->size - off); | 177 | return len; |
157 | memcpy(to, fifo->buffer + off, l); | 178 | } |
179 | EXPORT_SYMBOL(__kfifo_out_peek); | ||
158 | 180 | ||
159 | /* then get the rest (if any) from the beginning of the buffer */ | 181 | unsigned int __kfifo_out(struct __kfifo *fifo, |
160 | memcpy(to + l, fifo->buffer, len - l); | 182 | void *buf, unsigned int len) |
183 | { | ||
184 | len = __kfifo_out_peek(fifo, buf, len); | ||
185 | fifo->out += len; | ||
186 | return len; | ||
161 | } | 187 | } |
188 | EXPORT_SYMBOL(__kfifo_out); | ||
162 | 189 | ||
163 | static inline int __kfifo_from_user_data(struct kfifo *fifo, | 190 | static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, |
164 | const void __user *from, unsigned int len, unsigned int off, | 191 | const void __user *from, unsigned int len, unsigned int off, |
165 | unsigned *lenout) | 192 | unsigned int *copied) |
166 | { | 193 | { |
194 | unsigned int size = fifo->mask + 1; | ||
195 | unsigned int esize = fifo->esize; | ||
167 | unsigned int l; | 196 | unsigned int l; |
168 | int ret; | 197 | unsigned long ret; |
169 | 198 | ||
199 | off &= fifo->mask; | ||
200 | if (esize != 1) { | ||
201 | off *= esize; | ||
202 | size *= esize; | ||
203 | len *= esize; | ||
204 | } | ||
205 | l = min(len, size - off); | ||
206 | |||
207 | ret = copy_from_user(fifo->data + off, from, l); | ||
208 | if (unlikely(ret)) | ||
209 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
210 | else { | ||
211 | ret = copy_from_user(fifo->data, from + l, len - l); | ||
212 | if (unlikely(ret)) | ||
213 | ret = DIV_ROUND_UP(ret, esize); | ||
214 | } | ||
170 | /* | 215 | /* |
171 | * Ensure that we sample the fifo->out index -before- we | 216 | * make sure that the data in the fifo is up to date before |
172 | * start putting bytes into the kfifo. | 217 | * incrementing the fifo->in index counter |
173 | */ | 218 | */ |
219 | smp_wmb(); | ||
220 | *copied = len - ret; | ||
221 | /* return the number of elements which are not copied */ | ||
222 | return ret; | ||
223 | } | ||
174 | 224 | ||
175 | smp_mb(); | 225 | int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, |
226 | unsigned long len, unsigned int *copied) | ||
227 | { | ||
228 | unsigned int l; | ||
229 | unsigned long ret; | ||
230 | unsigned int esize = fifo->esize; | ||
231 | int err; | ||
176 | 232 | ||
177 | off = __kfifo_off(fifo, fifo->in + off); | 233 | if (esize != 1) |
234 | len /= esize; | ||
178 | 235 | ||
179 | /* first put the data starting from fifo->in to buffer end */ | 236 | l = kfifo_unused(fifo); |
180 | l = min(len, fifo->size - off); | 237 | if (len > l) |
181 | ret = copy_from_user(fifo->buffer + off, from, l); | 238 | len = l; |
182 | if (unlikely(ret)) { | ||
183 | *lenout = ret; | ||
184 | return -EFAULT; | ||
185 | } | ||
186 | *lenout = l; | ||
187 | 239 | ||
188 | /* then put the rest (if any) at the beginning of the buffer */ | 240 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); |
189 | ret = copy_from_user(fifo->buffer, from + l, len - l); | 241 | if (unlikely(ret)) { |
190 | *lenout += ret ? ret : len - l; | 242 | len -= ret; |
191 | return ret ? -EFAULT : 0; | 243 | err = -EFAULT; |
244 | } else | ||
245 | err = 0; | ||
246 | fifo->in += len; | ||
247 | return err; | ||
192 | } | 248 | } |
249 | EXPORT_SYMBOL(__kfifo_from_user); | ||
193 | 250 | ||
194 | static inline int __kfifo_to_user_data(struct kfifo *fifo, | 251 | static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, |
195 | void __user *to, unsigned int len, unsigned int off, unsigned *lenout) | 252 | unsigned int len, unsigned int off, unsigned int *copied) |
196 | { | 253 | { |
197 | unsigned int l; | 254 | unsigned int l; |
198 | int ret; | 255 | unsigned long ret; |
199 | 256 | unsigned int size = fifo->mask + 1; | |
257 | unsigned int esize = fifo->esize; | ||
258 | |||
259 | off &= fifo->mask; | ||
260 | if (esize != 1) { | ||
261 | off *= esize; | ||
262 | size *= esize; | ||
263 | len *= esize; | ||
264 | } | ||
265 | l = min(len, size - off); | ||
266 | |||
267 | ret = copy_to_user(to, fifo->data + off, l); | ||
268 | if (unlikely(ret)) | ||
269 | ret = DIV_ROUND_UP(ret + len - l, esize); | ||
270 | else { | ||
271 | ret = copy_to_user(to + l, fifo->data, len - l); | ||
272 | if (unlikely(ret)) | ||
273 | ret = DIV_ROUND_UP(ret, esize); | ||
274 | } | ||
200 | /* | 275 | /* |
201 | * Ensure that we sample the fifo->in index -before- we | 276 | * make sure that the data is copied before |
202 | * start removing bytes from the kfifo. | 277 | * incrementing the fifo->out index counter |
203 | */ | 278 | */ |
279 | smp_wmb(); | ||
280 | *copied = len - ret; | ||
281 | /* return the number of elements which are not copied */ | ||
282 | return ret; | ||
283 | } | ||
204 | 284 | ||
205 | smp_rmb(); | 285 | int __kfifo_to_user(struct __kfifo *fifo, void __user *to, |
286 | unsigned long len, unsigned int *copied) | ||
287 | { | ||
288 | unsigned int l; | ||
289 | unsigned long ret; | ||
290 | unsigned int esize = fifo->esize; | ||
291 | int err; | ||
206 | 292 | ||
207 | off = __kfifo_off(fifo, fifo->out + off); | 293 | if (esize != 1) |
294 | len /= esize; | ||
208 | 295 | ||
209 | /* first get the data from fifo->out until the end of the buffer */ | 296 | l = fifo->in - fifo->out; |
210 | l = min(len, fifo->size - off); | 297 | if (len > l) |
211 | ret = copy_to_user(to, fifo->buffer + off, l); | 298 | len = l; |
212 | *lenout = l; | 299 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); |
213 | if (unlikely(ret)) { | 300 | if (unlikely(ret)) { |
214 | *lenout -= ret; | 301 | len -= ret; |
215 | return -EFAULT; | 302 | err = -EFAULT; |
216 | } | 303 | } else |
304 | err = 0; | ||
305 | fifo->out += len; | ||
306 | return err; | ||
307 | } | ||
308 | EXPORT_SYMBOL(__kfifo_to_user); | ||
217 | 309 | ||
218 | /* then get the rest (if any) from the beginning of the buffer */ | 310 | static int setup_sgl_buf(struct scatterlist *sgl, void *buf, |
219 | len -= l; | 311 | int nents, unsigned int len) |
220 | ret = copy_to_user(to + l, fifo->buffer, len); | 312 | { |
221 | if (unlikely(ret)) { | 313 | int n; |
222 | *lenout += len - ret; | 314 | unsigned int l; |
223 | return -EFAULT; | 315 | unsigned int off; |
316 | struct page *page; | ||
317 | |||
318 | if (!nents) | ||
319 | return 0; | ||
320 | |||
321 | if (!len) | ||
322 | return 0; | ||
323 | |||
324 | n = 0; | ||
325 | page = virt_to_page(buf); | ||
326 | off = offset_in_page(buf); | ||
327 | l = 0; | ||
328 | |||
329 | while (len >= l + PAGE_SIZE - off) { | ||
330 | struct page *npage; | ||
331 | |||
332 | l += PAGE_SIZE; | ||
333 | buf += PAGE_SIZE; | ||
334 | npage = virt_to_page(buf); | ||
335 | if (page_to_phys(page) != page_to_phys(npage) - l) { | ||
336 | sg_set_page(sgl, page, l - off, off); | ||
337 | sgl = sg_next(sgl); | ||
338 | if (++n == nents || sgl == NULL) | ||
339 | return n; | ||
340 | page = npage; | ||
341 | len -= l - off; | ||
342 | l = off = 0; | ||
343 | } | ||
224 | } | 344 | } |
225 | *lenout += len; | 345 | sg_set_page(sgl, page, len, off); |
226 | return 0; | 346 | return n + 1; |
227 | } | 347 | } |
228 | 348 | ||
229 | unsigned int __kfifo_in_n(struct kfifo *fifo, | 349 | static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, |
230 | const void *from, unsigned int len, unsigned int recsize) | 350 | int nents, unsigned int len, unsigned int off) |
231 | { | 351 | { |
232 | if (kfifo_avail(fifo) < len + recsize) | 352 | unsigned int size = fifo->mask + 1; |
233 | return len + 1; | 353 | unsigned int esize = fifo->esize; |
354 | unsigned int l; | ||
355 | unsigned int n; | ||
234 | 356 | ||
235 | __kfifo_in_data(fifo, from, len, recsize); | 357 | off &= fifo->mask; |
236 | return 0; | 358 | if (esize != 1) { |
359 | off *= esize; | ||
360 | size *= esize; | ||
361 | len *= esize; | ||
362 | } | ||
363 | l = min(len, size - off); | ||
364 | |||
365 | n = setup_sgl_buf(sgl, fifo->data + off, nents, l); | ||
366 | n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); | ||
367 | |||
368 | return n; | ||
237 | } | 369 | } |
238 | EXPORT_SYMBOL(__kfifo_in_n); | ||
239 | 370 | ||
240 | /** | 371 | unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, |
241 | * kfifo_in - puts some data into the FIFO | 372 | struct scatterlist *sgl, int nents, unsigned int len) |
242 | * @fifo: the fifo to be used. | ||
243 | * @from: the data to be added. | ||
244 | * @len: the length of the data to be added. | ||
245 | * | ||
246 | * This function copies at most @len bytes from the @from buffer into | ||
247 | * the FIFO depending on the free space, and returns the number of | ||
248 | * bytes copied. | ||
249 | * | ||
250 | * Note that with only one concurrent reader and one concurrent | ||
251 | * writer, you don't need extra locking to use these functions. | ||
252 | */ | ||
253 | unsigned int kfifo_in(struct kfifo *fifo, const void *from, | ||
254 | unsigned int len) | ||
255 | { | 373 | { |
256 | len = min(kfifo_avail(fifo), len); | 374 | unsigned int l; |
257 | 375 | ||
258 | __kfifo_in_data(fifo, from, len, 0); | 376 | l = kfifo_unused(fifo); |
259 | __kfifo_add_in(fifo, len); | 377 | if (len > l) |
260 | return len; | 378 | len = l; |
379 | |||
380 | return setup_sgl(fifo, sgl, nents, len, fifo->in); | ||
261 | } | 381 | } |
262 | EXPORT_SYMBOL(kfifo_in); | 382 | EXPORT_SYMBOL(__kfifo_dma_in_prepare); |
263 | 383 | ||
264 | unsigned int __kfifo_in_generic(struct kfifo *fifo, | 384 | unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, |
265 | const void *from, unsigned int len, unsigned int recsize) | 385 | struct scatterlist *sgl, int nents, unsigned int len) |
266 | { | 386 | { |
267 | return __kfifo_in_rec(fifo, from, len, recsize); | 387 | unsigned int l; |
388 | |||
389 | l = fifo->in - fifo->out; | ||
390 | if (len > l) | ||
391 | len = l; | ||
392 | |||
393 | return setup_sgl(fifo, sgl, nents, len, fifo->out); | ||
268 | } | 394 | } |
269 | EXPORT_SYMBOL(__kfifo_in_generic); | 395 | EXPORT_SYMBOL(__kfifo_dma_out_prepare); |
270 | 396 | ||
271 | unsigned int __kfifo_out_n(struct kfifo *fifo, | 397 | unsigned int __kfifo_max_r(unsigned int len, size_t recsize) |
272 | void *to, unsigned int len, unsigned int recsize) | ||
273 | { | 398 | { |
274 | if (kfifo_len(fifo) < len + recsize) | 399 | unsigned int max = (1 << (recsize << 3)) - 1; |
275 | return len; | ||
276 | 400 | ||
277 | __kfifo_out_data(fifo, to, len, recsize); | 401 | if (len > max) |
278 | __kfifo_add_out(fifo, len + recsize); | 402 | return max; |
279 | return 0; | 403 | return len; |
280 | } | 404 | } |
281 | EXPORT_SYMBOL(__kfifo_out_n); | ||
282 | 405 | ||
283 | /** | 406 | #define __KFIFO_PEEK(data, out, mask) \ |
284 | * kfifo_out - gets some data from the FIFO | 407 | ((data)[(out) & (mask)]) |
285 | * @fifo: the fifo to be used. | 408 | /* |
286 | * @to: where the data must be copied. | 409 | * __kfifo_peek_n internal helper function for determining the length of |
287 | * @len: the size of the destination buffer. | 410 | * the next record in the fifo |
288 | * | ||
289 | * This function copies at most @len bytes from the FIFO into the | ||
290 | * @to buffer and returns the number of copied bytes. | ||
291 | * | ||
292 | * Note that with only one concurrent reader and one concurrent | ||
293 | * writer, you don't need extra locking to use these functions. | ||
294 | */ | 411 | */ |
295 | unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) | 412 | static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) |
296 | { | 413 | { |
297 | len = min(kfifo_len(fifo), len); | 414 | unsigned int l; |
415 | unsigned int mask = fifo->mask; | ||
416 | unsigned char *data = fifo->data; | ||
298 | 417 | ||
299 | __kfifo_out_data(fifo, to, len, 0); | 418 | l = __KFIFO_PEEK(data, fifo->out, mask); |
300 | __kfifo_add_out(fifo, len); | ||
301 | 419 | ||
302 | return len; | 420 | if (--recsize) |
421 | l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; | ||
422 | |||
423 | return l; | ||
303 | } | 424 | } |
304 | EXPORT_SYMBOL(kfifo_out); | ||
305 | 425 | ||
306 | /** | 426 | #define __KFIFO_POKE(data, in, mask, val) \ |
307 | * kfifo_out_peek - copy some data from the FIFO, but do not remove it | 427 | ( \ |
308 | * @fifo: the fifo to be used. | 428 | (data)[(in) & (mask)] = (unsigned char)(val) \ |
309 | * @to: where the data must be copied. | 429 | ) |
310 | * @len: the size of the destination buffer. | 430 | |
311 | * @offset: offset into the fifo | 431 | /* |
312 | * | 432 | * __kfifo_poke_n internal helper function for storing the length of |
313 | * This function copies at most @len bytes at @offset from the FIFO | 433 | * the record into the fifo |
314 | * into the @to buffer and returns the number of copied bytes. | ||
315 | * The data is not removed from the FIFO. | ||
316 | */ | 434 | */ |
317 | unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, | 435 | static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) |
318 | unsigned offset) | ||
319 | { | 436 | { |
320 | len = min(kfifo_len(fifo), len + offset); | 437 | unsigned int mask = fifo->mask; |
438 | unsigned char *data = fifo->data; | ||
321 | 439 | ||
322 | __kfifo_out_data(fifo, to, len, offset); | 440 | __KFIFO_POKE(data, fifo->in, mask, n); |
323 | return len; | 441 | |
442 | if (recsize > 1) | ||
443 | __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); | ||
324 | } | 444 | } |
325 | EXPORT_SYMBOL(kfifo_out_peek); | ||
326 | 445 | ||
327 | unsigned int __kfifo_out_generic(struct kfifo *fifo, | 446 | unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) |
328 | void *to, unsigned int len, unsigned int recsize, | ||
329 | unsigned int *total) | ||
330 | { | 447 | { |
331 | return __kfifo_out_rec(fifo, to, len, recsize, total); | 448 | return __kfifo_peek_n(fifo, recsize); |
332 | } | 449 | } |
333 | EXPORT_SYMBOL(__kfifo_out_generic); | 450 | EXPORT_SYMBOL(__kfifo_len_r); |
334 | 451 | ||
335 | unsigned int __kfifo_from_user_n(struct kfifo *fifo, | 452 | unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, |
336 | const void __user *from, unsigned int len, unsigned int recsize) | 453 | unsigned int len, size_t recsize) |
337 | { | 454 | { |
338 | unsigned total; | 455 | if (len + recsize > kfifo_unused(fifo)) |
456 | return 0; | ||
339 | 457 | ||
340 | if (kfifo_avail(fifo) < len + recsize) | 458 | __kfifo_poke_n(fifo, len, recsize); |
341 | return len + 1; | ||
342 | 459 | ||
343 | __kfifo_from_user_data(fifo, from, len, recsize, &total); | 460 | kfifo_copy_in(fifo, buf, len, fifo->in + recsize); |
344 | return total; | 461 | fifo->in += len + recsize; |
462 | return len; | ||
345 | } | 463 | } |
346 | EXPORT_SYMBOL(__kfifo_from_user_n); | 464 | EXPORT_SYMBOL(__kfifo_in_r); |
347 | 465 | ||
348 | /** | 466 | static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, |
349 | * kfifo_from_user - puts some data from user space into the FIFO | 467 | void *buf, unsigned int len, size_t recsize, unsigned int *n) |
350 | * @fifo: the fifo to be used. | 468 | { |
351 | * @from: pointer to the data to be added. | 469 | *n = __kfifo_peek_n(fifo, recsize); |
352 | * @len: the length of the data to be added. | 470 | |
353 | * @total: the actual returned data length. | 471 | if (len > *n) |
354 | * | 472 | len = *n; |
355 | * This function copies at most @len bytes from the @from into the | 473 | |
356 | * FIFO depending and returns -EFAULT/0. | 474 | kfifo_copy_out(fifo, buf, len, fifo->out + recsize); |
357 | * | 475 | return len; |
358 | * Note that with only one concurrent reader and one concurrent | ||
359 | * writer, you don't need extra locking to use these functions. | ||
360 | */ | ||
361 | int kfifo_from_user(struct kfifo *fifo, | ||
362 | const void __user *from, unsigned int len, unsigned *total) | ||
363 | { | ||
364 | int ret; | ||
365 | len = min(kfifo_avail(fifo), len); | ||
366 | ret = __kfifo_from_user_data(fifo, from, len, 0, total); | ||
367 | if (ret) | ||
368 | return ret; | ||
369 | __kfifo_add_in(fifo, len); | ||
370 | return 0; | ||
371 | } | 476 | } |
372 | EXPORT_SYMBOL(kfifo_from_user); | ||
373 | 477 | ||
374 | unsigned int __kfifo_from_user_generic(struct kfifo *fifo, | 478 | unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, |
375 | const void __user *from, unsigned int len, unsigned int recsize) | 479 | unsigned int len, size_t recsize) |
376 | { | 480 | { |
377 | return __kfifo_from_user_rec(fifo, from, len, recsize); | 481 | unsigned int n; |
482 | |||
483 | if (fifo->in == fifo->out) | ||
484 | return 0; | ||
485 | |||
486 | return kfifo_out_copy_r(fifo, buf, len, recsize, &n); | ||
378 | } | 487 | } |
379 | EXPORT_SYMBOL(__kfifo_from_user_generic); | 488 | EXPORT_SYMBOL(__kfifo_out_peek_r); |
380 | 489 | ||
381 | unsigned int __kfifo_to_user_n(struct kfifo *fifo, | 490 | unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, |
382 | void __user *to, unsigned int len, unsigned int reclen, | 491 | unsigned int len, size_t recsize) |
383 | unsigned int recsize) | ||
384 | { | 492 | { |
385 | unsigned int ret, total; | 493 | unsigned int n; |
386 | 494 | ||
387 | if (kfifo_len(fifo) < reclen + recsize) | 495 | if (fifo->in == fifo->out) |
388 | return len; | 496 | return 0; |
389 | 497 | ||
390 | ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); | 498 | len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); |
499 | fifo->out += n + recsize; | ||
500 | return len; | ||
501 | } | ||
502 | EXPORT_SYMBOL(__kfifo_out_r); | ||
391 | 503 | ||
392 | if (likely(ret == 0)) | 504 | void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) |
393 | __kfifo_add_out(fifo, reclen + recsize); | 505 | { |
506 | unsigned int n; | ||
394 | 507 | ||
395 | return total; | 508 | n = __kfifo_peek_n(fifo, recsize); |
509 | fifo->out += n + recsize; | ||
396 | } | 510 | } |
397 | EXPORT_SYMBOL(__kfifo_to_user_n); | 511 | EXPORT_SYMBOL(__kfifo_skip_r); |
398 | 512 | ||
399 | /** | 513 | int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, |
400 | * kfifo_to_user - gets data from the FIFO and write it to user space | 514 | unsigned long len, unsigned int *copied, size_t recsize) |
401 | * @fifo: the fifo to be used. | ||
402 | * @to: where the data must be copied. | ||
403 | * @len: the size of the destination buffer. | ||
404 | * @lenout: pointer to output variable with copied data | ||
405 | * | ||
406 | * This function copies at most @len bytes from the FIFO into the | ||
407 | * @to buffer and 0 or -EFAULT. | ||
408 | * | ||
409 | * Note that with only one concurrent reader and one concurrent | ||
410 | * writer, you don't need extra locking to use these functions. | ||
411 | */ | ||
412 | int kfifo_to_user(struct kfifo *fifo, | ||
413 | void __user *to, unsigned int len, unsigned *lenout) | ||
414 | { | 515 | { |
415 | int ret; | 516 | unsigned long ret; |
416 | len = min(kfifo_len(fifo), len); | 517 | |
417 | ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); | 518 | len = __kfifo_max_r(len, recsize); |
418 | __kfifo_add_out(fifo, *lenout); | 519 | |
419 | return ret; | 520 | if (len + recsize > kfifo_unused(fifo)) { |
521 | *copied = 0; | ||
522 | return 0; | ||
523 | } | ||
524 | |||
525 | __kfifo_poke_n(fifo, len, recsize); | ||
526 | |||
527 | ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); | ||
528 | if (unlikely(ret)) { | ||
529 | *copied = 0; | ||
530 | return -EFAULT; | ||
531 | } | ||
532 | fifo->in += len + recsize; | ||
533 | return 0; | ||
420 | } | 534 | } |
421 | EXPORT_SYMBOL(kfifo_to_user); | 535 | EXPORT_SYMBOL(__kfifo_from_user_r); |
422 | 536 | ||
423 | unsigned int __kfifo_to_user_generic(struct kfifo *fifo, | 537 | int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, |
424 | void __user *to, unsigned int len, unsigned int recsize, | 538 | unsigned long len, unsigned int *copied, size_t recsize) |
425 | unsigned int *total) | ||
426 | { | 539 | { |
427 | return __kfifo_to_user_rec(fifo, to, len, recsize, total); | 540 | unsigned long ret; |
541 | unsigned int n; | ||
542 | |||
543 | if (fifo->in == fifo->out) { | ||
544 | *copied = 0; | ||
545 | return 0; | ||
546 | } | ||
547 | |||
548 | n = __kfifo_peek_n(fifo, recsize); | ||
549 | if (len > n) | ||
550 | len = n; | ||
551 | |||
552 | ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); | ||
553 | if (unlikely(ret)) { | ||
554 | *copied = 0; | ||
555 | return -EFAULT; | ||
556 | } | ||
557 | fifo->out += n + recsize; | ||
558 | return 0; | ||
428 | } | 559 | } |
429 | EXPORT_SYMBOL(__kfifo_to_user_generic); | 560 | EXPORT_SYMBOL(__kfifo_to_user_r); |
430 | 561 | ||
431 | unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) | 562 | unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, |
563 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
432 | { | 564 | { |
433 | if (recsize == 0) | 565 | if (!nents) |
434 | return kfifo_avail(fifo); | 566 | BUG(); |
435 | 567 | ||
436 | return __kfifo_peek_n(fifo, recsize); | 568 | len = __kfifo_max_r(len, recsize); |
569 | |||
570 | if (len + recsize > kfifo_unused(fifo)) | ||
571 | return 0; | ||
572 | |||
573 | return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); | ||
437 | } | 574 | } |
438 | EXPORT_SYMBOL(__kfifo_peek_generic); | 575 | EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); |
439 | 576 | ||
440 | void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) | 577 | void __kfifo_dma_in_finish_r(struct __kfifo *fifo, |
578 | unsigned int len, size_t recsize) | ||
441 | { | 579 | { |
442 | __kfifo_skip_rec(fifo, recsize); | 580 | len = __kfifo_max_r(len, recsize); |
581 | __kfifo_poke_n(fifo, len, recsize); | ||
582 | fifo->in += len + recsize; | ||
443 | } | 583 | } |
444 | EXPORT_SYMBOL(__kfifo_skip_generic); | 584 | EXPORT_SYMBOL(__kfifo_dma_in_finish_r); |
445 | 585 | ||
586 | unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, | ||
587 | struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) | ||
588 | { | ||
589 | if (!nents) | ||
590 | BUG(); | ||
591 | |||
592 | len = __kfifo_max_r(len, recsize); | ||
593 | |||
594 | if (len + recsize > fifo->in - fifo->out) | ||
595 | return 0; | ||
596 | |||
597 | return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); | ||
598 | } | ||
599 | EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); | ||
600 | |||
601 | void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) | ||
602 | { | ||
603 | unsigned int len; | ||
604 | |||
605 | len = __kfifo_peek_n(fifo, recsize); | ||
606 | fifo->out += len + recsize; | ||
607 | } | ||
608 | EXPORT_SYMBOL(__kfifo_dma_out_finish_r); | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index 6e9b19667a8d..9cd0591c96a2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data) | |||
153 | goto fail; | 153 | goto fail; |
154 | } | 154 | } |
155 | 155 | ||
156 | retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); | 156 | retval = kernel_execve(sub_info->path, |
157 | (const char *const *)sub_info->argv, | ||
158 | (const char *const *)sub_info->envp); | ||
157 | 159 | ||
158 | /* Exec failed? */ | 160 | /* Exec failed? */ |
159 | fail: | 161 | fail: |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c780175..2dc3786349d1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -14,6 +14,8 @@ | |||
14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/slab.h> | ||
18 | #include <linux/freezer.h> | ||
17 | #include <trace/events/sched.h> | 19 | #include <trace/events/sched.h> |
18 | 20 | ||
19 | static DEFINE_SPINLOCK(kthread_create_lock); | 21 | static DEFINE_SPINLOCK(kthread_create_lock); |
@@ -35,6 +37,7 @@ struct kthread_create_info | |||
35 | 37 | ||
36 | struct kthread { | 38 | struct kthread { |
37 | int should_stop; | 39 | int should_stop; |
40 | void *data; | ||
38 | struct completion exited; | 41 | struct completion exited; |
39 | }; | 42 | }; |
40 | 43 | ||
@@ -54,6 +57,19 @@ int kthread_should_stop(void) | |||
54 | } | 57 | } |
55 | EXPORT_SYMBOL(kthread_should_stop); | 58 | EXPORT_SYMBOL(kthread_should_stop); |
56 | 59 | ||
60 | /** | ||
61 | * kthread_data - return data value specified on kthread creation | ||
62 | * @task: kthread task in question | ||
63 | * | ||
64 | * Return the data value specified when kthread @task was created. | ||
65 | * The caller is responsible for ensuring the validity of @task when | ||
66 | * calling this function. | ||
67 | */ | ||
68 | void *kthread_data(struct task_struct *task) | ||
69 | { | ||
70 | return to_kthread(task)->data; | ||
71 | } | ||
72 | |||
57 | static int kthread(void *_create) | 73 | static int kthread(void *_create) |
58 | { | 74 | { |
59 | /* Copy data: it's on kthread's stack */ | 75 | /* Copy data: it's on kthread's stack */ |
@@ -64,6 +80,7 @@ static int kthread(void *_create) | |||
64 | int ret; | 80 | int ret; |
65 | 81 | ||
66 | self.should_stop = 0; | 82 | self.should_stop = 0; |
83 | self.data = data; | ||
67 | init_completion(&self.exited); | 84 | init_completion(&self.exited); |
68 | current->vfork_done = &self.exited; | 85 | current->vfork_done = &self.exited; |
69 | 86 | ||
@@ -247,3 +264,150 @@ int kthreadd(void *unused) | |||
247 | 264 | ||
248 | return 0; | 265 | return 0; |
249 | } | 266 | } |
267 | |||
268 | /** | ||
269 | * kthread_worker_fn - kthread function to process kthread_worker | ||
270 | * @worker_ptr: pointer to initialized kthread_worker | ||
271 | * | ||
272 | * This function can be used as @threadfn to kthread_create() or | ||
273 | * kthread_run() with @worker_ptr argument pointing to an initialized | ||
274 | * kthread_worker. The started kthread will process work_list until | ||
275 | * it is stopped with kthread_stop(). A kthread can also call | ||
276 | * this function directly after extra initialization. | ||
277 | * | ||
278 | * Different kthreads can be used for the same kthread_worker as long | ||
279 | * as there's only one kthread attached to it at any given time. A | ||
280 | * kthread_worker without an attached kthread simply collects queued | ||
281 | * kthread_works. | ||
282 | */ | ||
283 | int kthread_worker_fn(void *worker_ptr) | ||
284 | { | ||
285 | struct kthread_worker *worker = worker_ptr; | ||
286 | struct kthread_work *work; | ||
287 | |||
288 | WARN_ON(worker->task); | ||
289 | worker->task = current; | ||
290 | repeat: | ||
291 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | ||
292 | |||
293 | if (kthread_should_stop()) { | ||
294 | __set_current_state(TASK_RUNNING); | ||
295 | spin_lock_irq(&worker->lock); | ||
296 | worker->task = NULL; | ||
297 | spin_unlock_irq(&worker->lock); | ||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | work = NULL; | ||
302 | spin_lock_irq(&worker->lock); | ||
303 | if (!list_empty(&worker->work_list)) { | ||
304 | work = list_first_entry(&worker->work_list, | ||
305 | struct kthread_work, node); | ||
306 | list_del_init(&work->node); | ||
307 | } | ||
308 | spin_unlock_irq(&worker->lock); | ||
309 | |||
310 | if (work) { | ||
311 | __set_current_state(TASK_RUNNING); | ||
312 | work->func(work); | ||
313 | smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ | ||
314 | work->done_seq = work->queue_seq; | ||
315 | smp_mb(); /* mb worker-b1 paired with flush-b0 */ | ||
316 | if (atomic_read(&work->flushing)) | ||
317 | wake_up_all(&work->done); | ||
318 | } else if (!freezing(current)) | ||
319 | schedule(); | ||
320 | |||
321 | try_to_freeze(); | ||
322 | goto repeat; | ||
323 | } | ||
324 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | ||
325 | |||
326 | /** | ||
327 | * queue_kthread_work - queue a kthread_work | ||
328 | * @worker: target kthread_worker | ||
329 | * @work: kthread_work to queue | ||
330 | * | ||
331 | * Queue @work to work processor @worker for async execution. @worker | ||
332 | * must have been initialized before work is queued to it. Returns %true | ||
333 | * if @work was successfully queued, %false if it was already pending. | ||
334 | */ | ||
335 | bool queue_kthread_work(struct kthread_worker *worker, | ||
336 | struct kthread_work *work) | ||
337 | { | ||
338 | bool ret = false; | ||
339 | unsigned long flags; | ||
340 | |||
341 | spin_lock_irqsave(&worker->lock, flags); | ||
342 | if (list_empty(&work->node)) { | ||
343 | list_add_tail(&work->node, &worker->work_list); | ||
344 | work->queue_seq++; | ||
345 | if (likely(worker->task)) | ||
346 | wake_up_process(worker->task); | ||
347 | ret = true; | ||
348 | } | ||
349 | spin_unlock_irqrestore(&worker->lock, flags); | ||
350 | return ret; | ||
351 | } | ||
352 | EXPORT_SYMBOL_GPL(queue_kthread_work); | ||
353 | |||
354 | /** | ||
355 | * flush_kthread_work - flush a kthread_work | ||
356 | * @work: work to flush | ||
357 | * | ||
358 | * If @work is queued or executing, wait for it to finish execution. | ||
359 | */ | ||
360 | void flush_kthread_work(struct kthread_work *work) | ||
361 | { | ||
362 | int seq = work->queue_seq; | ||
363 | |||
364 | atomic_inc(&work->flushing); | ||
365 | |||
366 | /* | ||
367 | * mb flush-b0 paired with worker-b1, to make sure either | ||
368 | * worker sees the above increment or we see done_seq update. | ||
369 | */ | ||
370 | smp_mb__after_atomic_inc(); | ||
371 | |||
372 | /* A - B <= 0 tests whether B is in front of A regardless of overflow */ | ||
373 | wait_event(work->done, seq - work->done_seq <= 0); | ||
374 | atomic_dec(&work->flushing); | ||
375 | |||
376 | /* | ||
377 | * rmb flush-b1 paired with worker-b0, to make sure our caller | ||
378 | * sees every change made by work->func(). | ||
379 | */ | ||
380 | smp_mb__after_atomic_dec(); | ||
381 | } | ||
382 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
383 | |||
384 | struct kthread_flush_work { | ||
385 | struct kthread_work work; | ||
386 | struct completion done; | ||
387 | }; | ||
388 | |||
389 | static void kthread_flush_work_fn(struct kthread_work *work) | ||
390 | { | ||
391 | struct kthread_flush_work *fwork = | ||
392 | container_of(work, struct kthread_flush_work, work); | ||
393 | complete(&fwork->done); | ||
394 | } | ||
395 | |||
396 | /** | ||
397 | * flush_kthread_worker - flush all current works on a kthread_worker | ||
398 | * @worker: worker to flush | ||
399 | * | ||
400 | * Wait until all currently executing or pending works on @worker are | ||
401 | * finished. | ||
402 | */ | ||
403 | void flush_kthread_worker(struct kthread_worker *worker) | ||
404 | { | ||
405 | struct kthread_flush_work fwork = { | ||
406 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | ||
407 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), | ||
408 | }; | ||
409 | |||
410 | queue_kthread_work(worker, &fwork.work); | ||
411 | wait_for_completion(&fwork.done); | ||
412 | } | ||
413 | EXPORT_SYMBOL_GPL(flush_kthread_worker); | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37b..f2852a510232 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], | |||
146 | 146 | ||
147 | static inline u64 lockstat_clock(void) | 147 | static inline u64 lockstat_clock(void) |
148 | { | 148 | { |
149 | return cpu_clock(smp_processor_id()); | 149 | return local_clock(); |
150 | } | 150 | } |
151 | 151 | ||
152 | static int lock_point(unsigned long points[], unsigned long ip) | 152 | static int lock_point(unsigned long points[], unsigned long ip) |
diff --git a/kernel/module.c b/kernel/module.c index 6c562828c85c..ccd641991842 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | Copyright (C) 2002 Richard Henderson | 2 | Copyright (C) 2002 Richard Henderson |
3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | 3 | Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. |
4 | 4 | ||
5 | This program is free software; you can redistribute it and/or modify | 5 | This program is free software; you can redistribute it and/or modify |
6 | it under the terms of the GNU General Public License as published by | 6 | it under the terms of the GNU General Public License as published by |
@@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb) | |||
110 | } | 110 | } |
111 | EXPORT_SYMBOL(unregister_module_notifier); | 111 | EXPORT_SYMBOL(unregister_module_notifier); |
112 | 112 | ||
113 | struct load_info { | ||
114 | Elf_Ehdr *hdr; | ||
115 | unsigned long len; | ||
116 | Elf_Shdr *sechdrs; | ||
117 | char *secstrings, *strtab; | ||
118 | unsigned long *strmap; | ||
119 | unsigned long symoffs, stroffs; | ||
120 | struct _ddebug *debug; | ||
121 | unsigned int num_debug; | ||
122 | struct { | ||
123 | unsigned int sym, str, mod, vers, info, pcpu; | ||
124 | } index; | ||
125 | }; | ||
126 | |||
113 | /* We require a truly strong try_module_get(): 0 means failure due to | 127 | /* We require a truly strong try_module_get(): 0 means failure due to |
114 | ongoing or failed initialization etc. */ | 128 | ongoing or failed initialization etc. */ |
115 | static inline int strong_try_module_get(struct module *mod) | 129 | static inline int strong_try_module_get(struct module *mod) |
@@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code) | |||
140 | EXPORT_SYMBOL(__module_put_and_exit); | 154 | EXPORT_SYMBOL(__module_put_and_exit); |
141 | 155 | ||
142 | /* Find a module section: 0 means not found. */ | 156 | /* Find a module section: 0 means not found. */ |
143 | static unsigned int find_sec(Elf_Ehdr *hdr, | 157 | static unsigned int find_sec(const struct load_info *info, const char *name) |
144 | Elf_Shdr *sechdrs, | ||
145 | const char *secstrings, | ||
146 | const char *name) | ||
147 | { | 158 | { |
148 | unsigned int i; | 159 | unsigned int i; |
149 | 160 | ||
150 | for (i = 1; i < hdr->e_shnum; i++) | 161 | for (i = 1; i < info->hdr->e_shnum; i++) { |
162 | Elf_Shdr *shdr = &info->sechdrs[i]; | ||
151 | /* Alloc bit cleared means "ignore it." */ | 163 | /* Alloc bit cleared means "ignore it." */ |
152 | if ((sechdrs[i].sh_flags & SHF_ALLOC) | 164 | if ((shdr->sh_flags & SHF_ALLOC) |
153 | && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) | 165 | && strcmp(info->secstrings + shdr->sh_name, name) == 0) |
154 | return i; | 166 | return i; |
167 | } | ||
155 | return 0; | 168 | return 0; |
156 | } | 169 | } |
157 | 170 | ||
158 | /* Find a module section, or NULL. */ | 171 | /* Find a module section, or NULL. */ |
159 | static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, | 172 | static void *section_addr(const struct load_info *info, const char *name) |
160 | const char *secstrings, const char *name) | ||
161 | { | 173 | { |
162 | /* Section 0 has sh_addr 0. */ | 174 | /* Section 0 has sh_addr 0. */ |
163 | return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; | 175 | return (void *)info->sechdrs[find_sec(info, name)].sh_addr; |
164 | } | 176 | } |
165 | 177 | ||
166 | /* Find a module section, or NULL. Fill in number of "objects" in section. */ | 178 | /* Find a module section, or NULL. Fill in number of "objects" in section. */ |
167 | static void *section_objs(Elf_Ehdr *hdr, | 179 | static void *section_objs(const struct load_info *info, |
168 | Elf_Shdr *sechdrs, | ||
169 | const char *secstrings, | ||
170 | const char *name, | 180 | const char *name, |
171 | size_t object_size, | 181 | size_t object_size, |
172 | unsigned int *num) | 182 | unsigned int *num) |
173 | { | 183 | { |
174 | unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); | 184 | unsigned int sec = find_sec(info, name); |
175 | 185 | ||
176 | /* Section 0 has sh_addr 0 and sh_size 0. */ | 186 | /* Section 0 has sh_addr 0 and sh_size 0. */ |
177 | *num = sechdrs[sec].sh_size / object_size; | 187 | *num = info->sechdrs[sec].sh_size / object_size; |
178 | return (void *)sechdrs[sec].sh_addr; | 188 | return (void *)info->sechdrs[sec].sh_addr; |
179 | } | 189 | } |
180 | 190 | ||
181 | /* Provided by the linker */ | 191 | /* Provided by the linker */ |
@@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | |||
227 | unsigned int symnum, void *data), void *data) | 237 | unsigned int symnum, void *data), void *data) |
228 | { | 238 | { |
229 | struct module *mod; | 239 | struct module *mod; |
230 | const struct symsearch arr[] = { | 240 | static const struct symsearch arr[] = { |
231 | { __start___ksymtab, __stop___ksymtab, __start___kcrctab, | 241 | { __start___ksymtab, __stop___ksymtab, __start___kcrctab, |
232 | NOT_GPL_ONLY, false }, | 242 | NOT_GPL_ONLY, false }, |
233 | { __start___ksymtab_gpl, __stop___ksymtab_gpl, | 243 | { __start___ksymtab_gpl, __stop___ksymtab_gpl, |
@@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod, | |||
392 | mod->percpu = __alloc_reserved_percpu(size, align); | 402 | mod->percpu = __alloc_reserved_percpu(size, align); |
393 | if (!mod->percpu) { | 403 | if (!mod->percpu) { |
394 | printk(KERN_WARNING | 404 | printk(KERN_WARNING |
395 | "Could not allocate %lu bytes percpu data\n", size); | 405 | "%s: Could not allocate %lu bytes percpu data\n", |
406 | mod->name, size); | ||
396 | return -ENOMEM; | 407 | return -ENOMEM; |
397 | } | 408 | } |
398 | mod->percpu_size = size; | 409 | mod->percpu_size = size; |
@@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod) | |||
404 | free_percpu(mod->percpu); | 415 | free_percpu(mod->percpu); |
405 | } | 416 | } |
406 | 417 | ||
407 | static unsigned int find_pcpusec(Elf_Ehdr *hdr, | 418 | static unsigned int find_pcpusec(struct load_info *info) |
408 | Elf_Shdr *sechdrs, | ||
409 | const char *secstrings) | ||
410 | { | 419 | { |
411 | return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); | 420 | return find_sec(info, ".data..percpu"); |
412 | } | 421 | } |
413 | 422 | ||
414 | static void percpu_modcopy(struct module *mod, | 423 | static void percpu_modcopy(struct module *mod, |
@@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod, | |||
468 | static inline void percpu_modfree(struct module *mod) | 477 | static inline void percpu_modfree(struct module *mod) |
469 | { | 478 | { |
470 | } | 479 | } |
471 | static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, | 480 | static unsigned int find_pcpusec(struct load_info *info) |
472 | Elf_Shdr *sechdrs, | ||
473 | const char *secstrings) | ||
474 | { | 481 | { |
475 | return 0; | 482 | return 0; |
476 | } | 483 | } |
@@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; | |||
524 | EXPORT_TRACEPOINT_SYMBOL(module_get); | 531 | EXPORT_TRACEPOINT_SYMBOL(module_get); |
525 | 532 | ||
526 | /* Init the unload section of the module. */ | 533 | /* Init the unload section of the module. */ |
527 | static void module_unload_init(struct module *mod) | 534 | static int module_unload_init(struct module *mod) |
528 | { | 535 | { |
529 | int cpu; | 536 | mod->refptr = alloc_percpu(struct module_ref); |
537 | if (!mod->refptr) | ||
538 | return -ENOMEM; | ||
530 | 539 | ||
531 | INIT_LIST_HEAD(&mod->source_list); | 540 | INIT_LIST_HEAD(&mod->source_list); |
532 | INIT_LIST_HEAD(&mod->target_list); | 541 | INIT_LIST_HEAD(&mod->target_list); |
533 | for_each_possible_cpu(cpu) { | ||
534 | per_cpu_ptr(mod->refptr, cpu)->incs = 0; | ||
535 | per_cpu_ptr(mod->refptr, cpu)->decs = 0; | ||
536 | } | ||
537 | 542 | ||
538 | /* Hold reference count during initialization. */ | 543 | /* Hold reference count during initialization. */ |
539 | __this_cpu_write(mod->refptr->incs, 1); | 544 | __this_cpu_write(mod->refptr->incs, 1); |
540 | /* Backwards compatibility macros put refcount during init. */ | 545 | /* Backwards compatibility macros put refcount during init. */ |
541 | mod->waiter = current; | 546 | mod->waiter = current; |
547 | |||
548 | return 0; | ||
542 | } | 549 | } |
543 | 550 | ||
544 | /* Does a already use b? */ | 551 | /* Does a already use b? */ |
@@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod) | |||
618 | kfree(use); | 625 | kfree(use); |
619 | } | 626 | } |
620 | mutex_unlock(&module_mutex); | 627 | mutex_unlock(&module_mutex); |
628 | |||
629 | free_percpu(mod->refptr); | ||
621 | } | 630 | } |
622 | 631 | ||
623 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 632 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
@@ -891,8 +900,9 @@ int ref_module(struct module *a, struct module *b) | |||
891 | } | 900 | } |
892 | EXPORT_SYMBOL_GPL(ref_module); | 901 | EXPORT_SYMBOL_GPL(ref_module); |
893 | 902 | ||
894 | static inline void module_unload_init(struct module *mod) | 903 | static inline int module_unload_init(struct module *mod) |
895 | { | 904 | { |
905 | return 0; | ||
896 | } | 906 | } |
897 | #endif /* CONFIG_MODULE_UNLOAD */ | 907 | #endif /* CONFIG_MODULE_UNLOAD */ |
898 | 908 | ||
@@ -1051,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic, | |||
1051 | #endif /* CONFIG_MODVERSIONS */ | 1061 | #endif /* CONFIG_MODVERSIONS */ |
1052 | 1062 | ||
1053 | /* Resolve a symbol for this module. I.e. if we find one, record usage. */ | 1063 | /* Resolve a symbol for this module. I.e. if we find one, record usage. */ |
1054 | static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, | 1064 | static const struct kernel_symbol *resolve_symbol(struct module *mod, |
1055 | unsigned int versindex, | 1065 | const struct load_info *info, |
1056 | const char *name, | 1066 | const char *name, |
1057 | struct module *mod, | ||
1058 | char ownername[]) | 1067 | char ownername[]) |
1059 | { | 1068 | { |
1060 | struct module *owner; | 1069 | struct module *owner; |
@@ -1068,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, | |||
1068 | if (!sym) | 1077 | if (!sym) |
1069 | goto unlock; | 1078 | goto unlock; |
1070 | 1079 | ||
1071 | if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { | 1080 | if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, |
1081 | owner)) { | ||
1072 | sym = ERR_PTR(-EINVAL); | 1082 | sym = ERR_PTR(-EINVAL); |
1073 | goto getname; | 1083 | goto getname; |
1074 | } | 1084 | } |
@@ -1087,21 +1097,20 @@ unlock: | |||
1087 | return sym; | 1097 | return sym; |
1088 | } | 1098 | } |
1089 | 1099 | ||
1090 | static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, | 1100 | static const struct kernel_symbol * |
1091 | unsigned int versindex, | 1101 | resolve_symbol_wait(struct module *mod, |
1092 | const char *name, | 1102 | const struct load_info *info, |
1093 | struct module *mod) | 1103 | const char *name) |
1094 | { | 1104 | { |
1095 | const struct kernel_symbol *ksym; | 1105 | const struct kernel_symbol *ksym; |
1096 | char ownername[MODULE_NAME_LEN]; | 1106 | char owner[MODULE_NAME_LEN]; |
1097 | 1107 | ||
1098 | if (wait_event_interruptible_timeout(module_wq, | 1108 | if (wait_event_interruptible_timeout(module_wq, |
1099 | !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, | 1109 | !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) |
1100 | mod, ownername)) || | 1110 | || PTR_ERR(ksym) != -EBUSY, |
1101 | PTR_ERR(ksym) != -EBUSY, | ||
1102 | 30 * HZ) <= 0) { | 1111 | 30 * HZ) <= 0) { |
1103 | printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", | 1112 | printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", |
1104 | mod->name, ownername); | 1113 | mod->name, owner); |
1105 | } | 1114 | } |
1106 | return ksym; | 1115 | return ksym; |
1107 | } | 1116 | } |
@@ -1110,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, | |||
1110 | * /sys/module/foo/sections stuff | 1119 | * /sys/module/foo/sections stuff |
1111 | * J. Corbet <corbet@lwn.net> | 1120 | * J. Corbet <corbet@lwn.net> |
1112 | */ | 1121 | */ |
1113 | #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) | 1122 | #ifdef CONFIG_SYSFS |
1114 | 1123 | ||
1124 | #ifdef CONFIG_KALLSYMS | ||
1115 | static inline bool sect_empty(const Elf_Shdr *sect) | 1125 | static inline bool sect_empty(const Elf_Shdr *sect) |
1116 | { | 1126 | { |
1117 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; | 1127 | return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; |
@@ -1148,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) | |||
1148 | kfree(sect_attrs); | 1158 | kfree(sect_attrs); |
1149 | } | 1159 | } |
1150 | 1160 | ||
1151 | static void add_sect_attrs(struct module *mod, unsigned int nsect, | 1161 | static void add_sect_attrs(struct module *mod, const struct load_info *info) |
1152 | char *secstrings, Elf_Shdr *sechdrs) | ||
1153 | { | 1162 | { |
1154 | unsigned int nloaded = 0, i, size[2]; | 1163 | unsigned int nloaded = 0, i, size[2]; |
1155 | struct module_sect_attrs *sect_attrs; | 1164 | struct module_sect_attrs *sect_attrs; |
@@ -1157,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1157 | struct attribute **gattr; | 1166 | struct attribute **gattr; |
1158 | 1167 | ||
1159 | /* Count loaded sections and allocate structures */ | 1168 | /* Count loaded sections and allocate structures */ |
1160 | for (i = 0; i < nsect; i++) | 1169 | for (i = 0; i < info->hdr->e_shnum; i++) |
1161 | if (!sect_empty(&sechdrs[i])) | 1170 | if (!sect_empty(&info->sechdrs[i])) |
1162 | nloaded++; | 1171 | nloaded++; |
1163 | size[0] = ALIGN(sizeof(*sect_attrs) | 1172 | size[0] = ALIGN(sizeof(*sect_attrs) |
1164 | + nloaded * sizeof(sect_attrs->attrs[0]), | 1173 | + nloaded * sizeof(sect_attrs->attrs[0]), |
@@ -1175,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
1175 | sect_attrs->nsections = 0; | 1184 | sect_attrs->nsections = 0; |
1176 | sattr = &sect_attrs->attrs[0]; | 1185 | sattr = &sect_attrs->attrs[0]; |
1177 | gattr = &sect_attrs->grp.attrs[0]; | 1186 | gattr = &sect_attrs->grp.attrs[0]; |
1178 | for (i = 0; i < nsect; i++) { | 1187 | for (i = 0; i < info->hdr->e_shnum; i++) { |
1179 | if (sect_empty(&sechdrs[i])) | 1188 | Elf_Shdr *sec = &info->sechdrs[i]; |
1189 | if (sect_empty(sec)) | ||
1180 | continue; | 1190 | continue; |
1181 | sattr->address = sechdrs[i].sh_addr; | 1191 | sattr->address = sec->sh_addr; |
1182 | sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, | 1192 | sattr->name = kstrdup(info->secstrings + sec->sh_name, |
1183 | GFP_KERNEL); | 1193 | GFP_KERNEL); |
1184 | if (sattr->name == NULL) | 1194 | if (sattr->name == NULL) |
1185 | goto out; | 1195 | goto out; |
@@ -1247,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, | |||
1247 | kfree(notes_attrs); | 1257 | kfree(notes_attrs); |
1248 | } | 1258 | } |
1249 | 1259 | ||
1250 | static void add_notes_attrs(struct module *mod, unsigned int nsect, | 1260 | static void add_notes_attrs(struct module *mod, const struct load_info *info) |
1251 | char *secstrings, Elf_Shdr *sechdrs) | ||
1252 | { | 1261 | { |
1253 | unsigned int notes, loaded, i; | 1262 | unsigned int notes, loaded, i; |
1254 | struct module_notes_attrs *notes_attrs; | 1263 | struct module_notes_attrs *notes_attrs; |
@@ -1260,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1260 | 1269 | ||
1261 | /* Count notes sections and allocate structures. */ | 1270 | /* Count notes sections and allocate structures. */ |
1262 | notes = 0; | 1271 | notes = 0; |
1263 | for (i = 0; i < nsect; i++) | 1272 | for (i = 0; i < info->hdr->e_shnum; i++) |
1264 | if (!sect_empty(&sechdrs[i]) && | 1273 | if (!sect_empty(&info->sechdrs[i]) && |
1265 | (sechdrs[i].sh_type == SHT_NOTE)) | 1274 | (info->sechdrs[i].sh_type == SHT_NOTE)) |
1266 | ++notes; | 1275 | ++notes; |
1267 | 1276 | ||
1268 | if (notes == 0) | 1277 | if (notes == 0) |
@@ -1276,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1276 | 1285 | ||
1277 | notes_attrs->notes = notes; | 1286 | notes_attrs->notes = notes; |
1278 | nattr = &notes_attrs->attrs[0]; | 1287 | nattr = &notes_attrs->attrs[0]; |
1279 | for (loaded = i = 0; i < nsect; ++i) { | 1288 | for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { |
1280 | if (sect_empty(&sechdrs[i])) | 1289 | if (sect_empty(&info->sechdrs[i])) |
1281 | continue; | 1290 | continue; |
1282 | if (sechdrs[i].sh_type == SHT_NOTE) { | 1291 | if (info->sechdrs[i].sh_type == SHT_NOTE) { |
1283 | sysfs_bin_attr_init(nattr); | 1292 | sysfs_bin_attr_init(nattr); |
1284 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; | 1293 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; |
1285 | nattr->attr.mode = S_IRUGO; | 1294 | nattr->attr.mode = S_IRUGO; |
1286 | nattr->size = sechdrs[i].sh_size; | 1295 | nattr->size = info->sechdrs[i].sh_size; |
1287 | nattr->private = (void *) sechdrs[i].sh_addr; | 1296 | nattr->private = (void *) info->sechdrs[i].sh_addr; |
1288 | nattr->read = module_notes_read; | 1297 | nattr->read = module_notes_read; |
1289 | ++nattr; | 1298 | ++nattr; |
1290 | } | 1299 | } |
@@ -1315,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod) | |||
1315 | 1324 | ||
1316 | #else | 1325 | #else |
1317 | 1326 | ||
1318 | static inline void add_sect_attrs(struct module *mod, unsigned int nsect, | 1327 | static inline void add_sect_attrs(struct module *mod, |
1319 | char *sectstrings, Elf_Shdr *sechdrs) | 1328 | const struct load_info *info) |
1320 | { | 1329 | { |
1321 | } | 1330 | } |
1322 | 1331 | ||
@@ -1324,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod) | |||
1324 | { | 1333 | { |
1325 | } | 1334 | } |
1326 | 1335 | ||
1327 | static inline void add_notes_attrs(struct module *mod, unsigned int nsect, | 1336 | static inline void add_notes_attrs(struct module *mod, |
1328 | char *sectstrings, Elf_Shdr *sechdrs) | 1337 | const struct load_info *info) |
1329 | { | 1338 | { |
1330 | } | 1339 | } |
1331 | 1340 | ||
1332 | static inline void remove_notes_attrs(struct module *mod) | 1341 | static inline void remove_notes_attrs(struct module *mod) |
1333 | { | 1342 | { |
1334 | } | 1343 | } |
1335 | #endif | 1344 | #endif /* CONFIG_KALLSYMS */ |
1336 | 1345 | ||
1337 | #ifdef CONFIG_SYSFS | ||
1338 | static void add_usage_links(struct module *mod) | 1346 | static void add_usage_links(struct module *mod) |
1339 | { | 1347 | { |
1340 | #ifdef CONFIG_MODULE_UNLOAD | 1348 | #ifdef CONFIG_MODULE_UNLOAD |
@@ -1439,6 +1447,7 @@ out: | |||
1439 | } | 1447 | } |
1440 | 1448 | ||
1441 | static int mod_sysfs_setup(struct module *mod, | 1449 | static int mod_sysfs_setup(struct module *mod, |
1450 | const struct load_info *info, | ||
1442 | struct kernel_param *kparam, | 1451 | struct kernel_param *kparam, |
1443 | unsigned int num_params) | 1452 | unsigned int num_params) |
1444 | { | 1453 | { |
@@ -1463,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod, | |||
1463 | goto out_unreg_param; | 1472 | goto out_unreg_param; |
1464 | 1473 | ||
1465 | add_usage_links(mod); | 1474 | add_usage_links(mod); |
1475 | add_sect_attrs(mod, info); | ||
1476 | add_notes_attrs(mod, info); | ||
1466 | 1477 | ||
1467 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); | 1478 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); |
1468 | return 0; | 1479 | return 0; |
@@ -1479,33 +1490,26 @@ out: | |||
1479 | 1490 | ||
1480 | static void mod_sysfs_fini(struct module *mod) | 1491 | static void mod_sysfs_fini(struct module *mod) |
1481 | { | 1492 | { |
1493 | remove_notes_attrs(mod); | ||
1494 | remove_sect_attrs(mod); | ||
1482 | kobject_put(&mod->mkobj.kobj); | 1495 | kobject_put(&mod->mkobj.kobj); |
1483 | } | 1496 | } |
1484 | 1497 | ||
1485 | #else /* CONFIG_SYSFS */ | 1498 | #else /* !CONFIG_SYSFS */ |
1486 | |||
1487 | static inline int mod_sysfs_init(struct module *mod) | ||
1488 | { | ||
1489 | return 0; | ||
1490 | } | ||
1491 | 1499 | ||
1492 | static inline int mod_sysfs_setup(struct module *mod, | 1500 | static int mod_sysfs_setup(struct module *mod, |
1501 | const struct load_info *info, | ||
1493 | struct kernel_param *kparam, | 1502 | struct kernel_param *kparam, |
1494 | unsigned int num_params) | 1503 | unsigned int num_params) |
1495 | { | 1504 | { |
1496 | return 0; | 1505 | return 0; |
1497 | } | 1506 | } |
1498 | 1507 | ||
1499 | static inline int module_add_modinfo_attrs(struct module *mod) | 1508 | static void mod_sysfs_fini(struct module *mod) |
1500 | { | ||
1501 | return 0; | ||
1502 | } | ||
1503 | |||
1504 | static inline void module_remove_modinfo_attrs(struct module *mod) | ||
1505 | { | 1509 | { |
1506 | } | 1510 | } |
1507 | 1511 | ||
1508 | static void mod_sysfs_fini(struct module *mod) | 1512 | static void module_remove_modinfo_attrs(struct module *mod) |
1509 | { | 1513 | { |
1510 | } | 1514 | } |
1511 | 1515 | ||
@@ -1515,7 +1519,7 @@ static void del_usage_links(struct module *mod) | |||
1515 | 1519 | ||
1516 | #endif /* CONFIG_SYSFS */ | 1520 | #endif /* CONFIG_SYSFS */ |
1517 | 1521 | ||
1518 | static void mod_kobject_remove(struct module *mod) | 1522 | static void mod_sysfs_teardown(struct module *mod) |
1519 | { | 1523 | { |
1520 | del_usage_links(mod); | 1524 | del_usage_links(mod); |
1521 | module_remove_modinfo_attrs(mod); | 1525 | module_remove_modinfo_attrs(mod); |
@@ -1533,6 +1537,7 @@ static int __unlink_module(void *_mod) | |||
1533 | { | 1537 | { |
1534 | struct module *mod = _mod; | 1538 | struct module *mod = _mod; |
1535 | list_del(&mod->list); | 1539 | list_del(&mod->list); |
1540 | module_bug_cleanup(mod); | ||
1536 | return 0; | 1541 | return 0; |
1537 | } | 1542 | } |
1538 | 1543 | ||
@@ -1545,9 +1550,7 @@ static void free_module(struct module *mod) | |||
1545 | mutex_lock(&module_mutex); | 1550 | mutex_lock(&module_mutex); |
1546 | stop_machine(__unlink_module, mod, NULL); | 1551 | stop_machine(__unlink_module, mod, NULL); |
1547 | mutex_unlock(&module_mutex); | 1552 | mutex_unlock(&module_mutex); |
1548 | remove_notes_attrs(mod); | 1553 | mod_sysfs_teardown(mod); |
1549 | remove_sect_attrs(mod); | ||
1550 | mod_kobject_remove(mod); | ||
1551 | 1554 | ||
1552 | /* Remove dynamic debug info */ | 1555 | /* Remove dynamic debug info */ |
1553 | ddebug_remove_module(mod->name); | 1556 | ddebug_remove_module(mod->name); |
@@ -1565,10 +1568,7 @@ static void free_module(struct module *mod) | |||
1565 | module_free(mod, mod->module_init); | 1568 | module_free(mod, mod->module_init); |
1566 | kfree(mod->args); | 1569 | kfree(mod->args); |
1567 | percpu_modfree(mod); | 1570 | percpu_modfree(mod); |
1568 | #if defined(CONFIG_MODULE_UNLOAD) | 1571 | |
1569 | if (mod->refptr) | ||
1570 | free_percpu(mod->refptr); | ||
1571 | #endif | ||
1572 | /* Free lock-classes: */ | 1572 | /* Free lock-classes: */ |
1573 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1573 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1574 | 1574 | ||
@@ -1634,25 +1634,23 @@ static int verify_export_symbols(struct module *mod) | |||
1634 | } | 1634 | } |
1635 | 1635 | ||
1636 | /* Change all symbols so that st_value encodes the pointer directly. */ | 1636 | /* Change all symbols so that st_value encodes the pointer directly. */ |
1637 | static int simplify_symbols(Elf_Shdr *sechdrs, | 1637 | static int simplify_symbols(struct module *mod, const struct load_info *info) |
1638 | unsigned int symindex, | 1638 | { |
1639 | const char *strtab, | 1639 | Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; |
1640 | unsigned int versindex, | 1640 | Elf_Sym *sym = (void *)symsec->sh_addr; |
1641 | unsigned int pcpuindex, | ||
1642 | struct module *mod) | ||
1643 | { | ||
1644 | Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; | ||
1645 | unsigned long secbase; | 1641 | unsigned long secbase; |
1646 | unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); | 1642 | unsigned int i; |
1647 | int ret = 0; | 1643 | int ret = 0; |
1648 | const struct kernel_symbol *ksym; | 1644 | const struct kernel_symbol *ksym; |
1649 | 1645 | ||
1650 | for (i = 1; i < n; i++) { | 1646 | for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { |
1647 | const char *name = info->strtab + sym[i].st_name; | ||
1648 | |||
1651 | switch (sym[i].st_shndx) { | 1649 | switch (sym[i].st_shndx) { |
1652 | case SHN_COMMON: | 1650 | case SHN_COMMON: |
1653 | /* We compiled with -fno-common. These are not | 1651 | /* We compiled with -fno-common. These are not |
1654 | supposed to happen. */ | 1652 | supposed to happen. */ |
1655 | DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); | 1653 | DEBUGP("Common symbol: %s\n", name); |
1656 | printk("%s: please compile with -fno-common\n", | 1654 | printk("%s: please compile with -fno-common\n", |
1657 | mod->name); | 1655 | mod->name); |
1658 | ret = -ENOEXEC; | 1656 | ret = -ENOEXEC; |
@@ -1665,9 +1663,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1665 | break; | 1663 | break; |
1666 | 1664 | ||
1667 | case SHN_UNDEF: | 1665 | case SHN_UNDEF: |
1668 | ksym = resolve_symbol_wait(sechdrs, versindex, | 1666 | ksym = resolve_symbol_wait(mod, info, name); |
1669 | strtab + sym[i].st_name, | ||
1670 | mod); | ||
1671 | /* Ok if resolved. */ | 1667 | /* Ok if resolved. */ |
1672 | if (ksym && !IS_ERR(ksym)) { | 1668 | if (ksym && !IS_ERR(ksym)) { |
1673 | sym[i].st_value = ksym->value; | 1669 | sym[i].st_value = ksym->value; |
@@ -1679,17 +1675,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1679 | break; | 1675 | break; |
1680 | 1676 | ||
1681 | printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", | 1677 | printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", |
1682 | mod->name, strtab + sym[i].st_name, | 1678 | mod->name, name, PTR_ERR(ksym)); |
1683 | PTR_ERR(ksym)); | ||
1684 | ret = PTR_ERR(ksym) ?: -ENOENT; | 1679 | ret = PTR_ERR(ksym) ?: -ENOENT; |
1685 | break; | 1680 | break; |
1686 | 1681 | ||
1687 | default: | 1682 | default: |
1688 | /* Divert to percpu allocation if a percpu var. */ | 1683 | /* Divert to percpu allocation if a percpu var. */ |
1689 | if (sym[i].st_shndx == pcpuindex) | 1684 | if (sym[i].st_shndx == info->index.pcpu) |
1690 | secbase = (unsigned long)mod_percpu(mod); | 1685 | secbase = (unsigned long)mod_percpu(mod); |
1691 | else | 1686 | else |
1692 | secbase = sechdrs[sym[i].st_shndx].sh_addr; | 1687 | secbase = info->sechdrs[sym[i].st_shndx].sh_addr; |
1693 | sym[i].st_value += secbase; | 1688 | sym[i].st_value += secbase; |
1694 | break; | 1689 | break; |
1695 | } | 1690 | } |
@@ -1698,6 +1693,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, | |||
1698 | return ret; | 1693 | return ret; |
1699 | } | 1694 | } |
1700 | 1695 | ||
1696 | static int apply_relocations(struct module *mod, const struct load_info *info) | ||
1697 | { | ||
1698 | unsigned int i; | ||
1699 | int err = 0; | ||
1700 | |||
1701 | /* Now do relocations. */ | ||
1702 | for (i = 1; i < info->hdr->e_shnum; i++) { | ||
1703 | unsigned int infosec = info->sechdrs[i].sh_info; | ||
1704 | |||
1705 | /* Not a valid relocation section? */ | ||
1706 | if (infosec >= info->hdr->e_shnum) | ||
1707 | continue; | ||
1708 | |||
1709 | /* Don't bother with non-allocated sections */ | ||
1710 | if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) | ||
1711 | continue; | ||
1712 | |||
1713 | if (info->sechdrs[i].sh_type == SHT_REL) | ||
1714 | err = apply_relocate(info->sechdrs, info->strtab, | ||
1715 | info->index.sym, i, mod); | ||
1716 | else if (info->sechdrs[i].sh_type == SHT_RELA) | ||
1717 | err = apply_relocate_add(info->sechdrs, info->strtab, | ||
1718 | info->index.sym, i, mod); | ||
1719 | if (err < 0) | ||
1720 | break; | ||
1721 | } | ||
1722 | return err; | ||
1723 | } | ||
1724 | |||
1701 | /* Additional bytes needed by arch in front of individual sections */ | 1725 | /* Additional bytes needed by arch in front of individual sections */ |
1702 | unsigned int __weak arch_mod_section_prepend(struct module *mod, | 1726 | unsigned int __weak arch_mod_section_prepend(struct module *mod, |
1703 | unsigned int section) | 1727 | unsigned int section) |
@@ -1722,10 +1746,7 @@ static long get_offset(struct module *mod, unsigned int *size, | |||
1722 | might -- code, read-only data, read-write data, small data. Tally | 1746 | might -- code, read-only data, read-write data, small data. Tally |
1723 | sizes, and place the offsets into sh_entsize fields: high bit means it | 1747 | sizes, and place the offsets into sh_entsize fields: high bit means it |
1724 | belongs in init. */ | 1748 | belongs in init. */ |
1725 | static void layout_sections(struct module *mod, | 1749 | static void layout_sections(struct module *mod, struct load_info *info) |
1726 | const Elf_Ehdr *hdr, | ||
1727 | Elf_Shdr *sechdrs, | ||
1728 | const char *secstrings) | ||
1729 | { | 1750 | { |
1730 | static unsigned long const masks[][2] = { | 1751 | static unsigned long const masks[][2] = { |
1731 | /* NOTE: all executable code must be the first section | 1752 | /* NOTE: all executable code must be the first section |
@@ -1738,21 +1759,22 @@ static void layout_sections(struct module *mod, | |||
1738 | }; | 1759 | }; |
1739 | unsigned int m, i; | 1760 | unsigned int m, i; |
1740 | 1761 | ||
1741 | for (i = 0; i < hdr->e_shnum; i++) | 1762 | for (i = 0; i < info->hdr->e_shnum; i++) |
1742 | sechdrs[i].sh_entsize = ~0UL; | 1763 | info->sechdrs[i].sh_entsize = ~0UL; |
1743 | 1764 | ||
1744 | DEBUGP("Core section allocation order:\n"); | 1765 | DEBUGP("Core section allocation order:\n"); |
1745 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 1766 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
1746 | for (i = 0; i < hdr->e_shnum; ++i) { | 1767 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
1747 | Elf_Shdr *s = &sechdrs[i]; | 1768 | Elf_Shdr *s = &info->sechdrs[i]; |
1769 | const char *sname = info->secstrings + s->sh_name; | ||
1748 | 1770 | ||
1749 | if ((s->sh_flags & masks[m][0]) != masks[m][0] | 1771 | if ((s->sh_flags & masks[m][0]) != masks[m][0] |
1750 | || (s->sh_flags & masks[m][1]) | 1772 | || (s->sh_flags & masks[m][1]) |
1751 | || s->sh_entsize != ~0UL | 1773 | || s->sh_entsize != ~0UL |
1752 | || strstarts(secstrings + s->sh_name, ".init")) | 1774 | || strstarts(sname, ".init")) |
1753 | continue; | 1775 | continue; |
1754 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 1776 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1755 | DEBUGP("\t%s\n", secstrings + s->sh_name); | 1777 | DEBUGP("\t%s\n", sname); |
1756 | } | 1778 | } |
1757 | if (m == 0) | 1779 | if (m == 0) |
1758 | mod->core_text_size = mod->core_size; | 1780 | mod->core_text_size = mod->core_size; |
@@ -1760,17 +1782,18 @@ static void layout_sections(struct module *mod, | |||
1760 | 1782 | ||
1761 | DEBUGP("Init section allocation order:\n"); | 1783 | DEBUGP("Init section allocation order:\n"); |
1762 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 1784 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
1763 | for (i = 0; i < hdr->e_shnum; ++i) { | 1785 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
1764 | Elf_Shdr *s = &sechdrs[i]; | 1786 | Elf_Shdr *s = &info->sechdrs[i]; |
1787 | const char *sname = info->secstrings + s->sh_name; | ||
1765 | 1788 | ||
1766 | if ((s->sh_flags & masks[m][0]) != masks[m][0] | 1789 | if ((s->sh_flags & masks[m][0]) != masks[m][0] |
1767 | || (s->sh_flags & masks[m][1]) | 1790 | || (s->sh_flags & masks[m][1]) |
1768 | || s->sh_entsize != ~0UL | 1791 | || s->sh_entsize != ~0UL |
1769 | || !strstarts(secstrings + s->sh_name, ".init")) | 1792 | || !strstarts(sname, ".init")) |
1770 | continue; | 1793 | continue; |
1771 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | 1794 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) |
1772 | | INIT_OFFSET_MASK); | 1795 | | INIT_OFFSET_MASK); |
1773 | DEBUGP("\t%s\n", secstrings + s->sh_name); | 1796 | DEBUGP("\t%s\n", sname); |
1774 | } | 1797 | } |
1775 | if (m == 0) | 1798 | if (m == 0) |
1776 | mod->init_text_size = mod->init_size; | 1799 | mod->init_text_size = mod->init_size; |
@@ -1809,33 +1832,28 @@ static char *next_string(char *string, unsigned long *secsize) | |||
1809 | return string; | 1832 | return string; |
1810 | } | 1833 | } |
1811 | 1834 | ||
1812 | static char *get_modinfo(Elf_Shdr *sechdrs, | 1835 | static char *get_modinfo(struct load_info *info, const char *tag) |
1813 | unsigned int info, | ||
1814 | const char *tag) | ||
1815 | { | 1836 | { |
1816 | char *p; | 1837 | char *p; |
1817 | unsigned int taglen = strlen(tag); | 1838 | unsigned int taglen = strlen(tag); |
1818 | unsigned long size = sechdrs[info].sh_size; | 1839 | Elf_Shdr *infosec = &info->sechdrs[info->index.info]; |
1840 | unsigned long size = infosec->sh_size; | ||
1819 | 1841 | ||
1820 | for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { | 1842 | for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { |
1821 | if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') | 1843 | if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') |
1822 | return p + taglen + 1; | 1844 | return p + taglen + 1; |
1823 | } | 1845 | } |
1824 | return NULL; | 1846 | return NULL; |
1825 | } | 1847 | } |
1826 | 1848 | ||
1827 | static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | 1849 | static void setup_modinfo(struct module *mod, struct load_info *info) |
1828 | unsigned int infoindex) | ||
1829 | { | 1850 | { |
1830 | struct module_attribute *attr; | 1851 | struct module_attribute *attr; |
1831 | int i; | 1852 | int i; |
1832 | 1853 | ||
1833 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | 1854 | for (i = 0; (attr = modinfo_attrs[i]); i++) { |
1834 | if (attr->setup) | 1855 | if (attr->setup) |
1835 | attr->setup(mod, | 1856 | attr->setup(mod, get_modinfo(info, attr->attr.name)); |
1836 | get_modinfo(sechdrs, | ||
1837 | infoindex, | ||
1838 | attr->attr.name)); | ||
1839 | } | 1857 | } |
1840 | } | 1858 | } |
1841 | 1859 | ||
@@ -1876,11 +1894,10 @@ static int is_exported(const char *name, unsigned long value, | |||
1876 | } | 1894 | } |
1877 | 1895 | ||
1878 | /* As per nm */ | 1896 | /* As per nm */ |
1879 | static char elf_type(const Elf_Sym *sym, | 1897 | static char elf_type(const Elf_Sym *sym, const struct load_info *info) |
1880 | Elf_Shdr *sechdrs, | ||
1881 | const char *secstrings, | ||
1882 | struct module *mod) | ||
1883 | { | 1898 | { |
1899 | const Elf_Shdr *sechdrs = info->sechdrs; | ||
1900 | |||
1884 | if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { | 1901 | if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { |
1885 | if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) | 1902 | if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) |
1886 | return 'v'; | 1903 | return 'v'; |
@@ -1910,8 +1927,10 @@ static char elf_type(const Elf_Sym *sym, | |||
1910 | else | 1927 | else |
1911 | return 'b'; | 1928 | return 'b'; |
1912 | } | 1929 | } |
1913 | if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) | 1930 | if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, |
1931 | ".debug")) { | ||
1914 | return 'n'; | 1932 | return 'n'; |
1933 | } | ||
1915 | return '?'; | 1934 | return '?'; |
1916 | } | 1935 | } |
1917 | 1936 | ||
@@ -1936,127 +1955,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, | |||
1936 | return true; | 1955 | return true; |
1937 | } | 1956 | } |
1938 | 1957 | ||
1939 | static unsigned long layout_symtab(struct module *mod, | 1958 | static void layout_symtab(struct module *mod, struct load_info *info) |
1940 | Elf_Shdr *sechdrs, | ||
1941 | unsigned int symindex, | ||
1942 | unsigned int strindex, | ||
1943 | const Elf_Ehdr *hdr, | ||
1944 | const char *secstrings, | ||
1945 | unsigned long *pstroffs, | ||
1946 | unsigned long *strmap) | ||
1947 | { | 1959 | { |
1948 | unsigned long symoffs; | 1960 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; |
1949 | Elf_Shdr *symsect = sechdrs + symindex; | 1961 | Elf_Shdr *strsect = info->sechdrs + info->index.str; |
1950 | Elf_Shdr *strsect = sechdrs + strindex; | ||
1951 | const Elf_Sym *src; | 1962 | const Elf_Sym *src; |
1952 | const char *strtab; | ||
1953 | unsigned int i, nsrc, ndst; | 1963 | unsigned int i, nsrc, ndst; |
1954 | 1964 | ||
1955 | /* Put symbol section at end of init part of module. */ | 1965 | /* Put symbol section at end of init part of module. */ |
1956 | symsect->sh_flags |= SHF_ALLOC; | 1966 | symsect->sh_flags |= SHF_ALLOC; |
1957 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, | 1967 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, |
1958 | symindex) | INIT_OFFSET_MASK; | 1968 | info->index.sym) | INIT_OFFSET_MASK; |
1959 | DEBUGP("\t%s\n", secstrings + symsect->sh_name); | 1969 | DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); |
1960 | 1970 | ||
1961 | src = (void *)hdr + symsect->sh_offset; | 1971 | src = (void *)info->hdr + symsect->sh_offset; |
1962 | nsrc = symsect->sh_size / sizeof(*src); | 1972 | nsrc = symsect->sh_size / sizeof(*src); |
1963 | strtab = (void *)hdr + strsect->sh_offset; | ||
1964 | for (ndst = i = 1; i < nsrc; ++i, ++src) | 1973 | for (ndst = i = 1; i < nsrc; ++i, ++src) |
1965 | if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { | 1974 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { |
1966 | unsigned int j = src->st_name; | 1975 | unsigned int j = src->st_name; |
1967 | 1976 | ||
1968 | while(!__test_and_set_bit(j, strmap) && strtab[j]) | 1977 | while (!__test_and_set_bit(j, info->strmap) |
1978 | && info->strtab[j]) | ||
1969 | ++j; | 1979 | ++j; |
1970 | ++ndst; | 1980 | ++ndst; |
1971 | } | 1981 | } |
1972 | 1982 | ||
1973 | /* Append room for core symbols at end of core part. */ | 1983 | /* Append room for core symbols at end of core part. */ |
1974 | symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); | 1984 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); |
1975 | mod->core_size = symoffs + ndst * sizeof(Elf_Sym); | 1985 | mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); |
1976 | 1986 | ||
1977 | /* Put string table section at end of init part of module. */ | 1987 | /* Put string table section at end of init part of module. */ |
1978 | strsect->sh_flags |= SHF_ALLOC; | 1988 | strsect->sh_flags |= SHF_ALLOC; |
1979 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, | 1989 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, |
1980 | strindex) | INIT_OFFSET_MASK; | 1990 | info->index.str) | INIT_OFFSET_MASK; |
1981 | DEBUGP("\t%s\n", secstrings + strsect->sh_name); | 1991 | DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); |
1982 | 1992 | ||
1983 | /* Append room for core symbols' strings at end of core part. */ | 1993 | /* Append room for core symbols' strings at end of core part. */ |
1984 | *pstroffs = mod->core_size; | 1994 | info->stroffs = mod->core_size; |
1985 | __set_bit(0, strmap); | 1995 | __set_bit(0, info->strmap); |
1986 | mod->core_size += bitmap_weight(strmap, strsect->sh_size); | 1996 | mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); |
1987 | |||
1988 | return symoffs; | ||
1989 | } | 1997 | } |
1990 | 1998 | ||
1991 | static void add_kallsyms(struct module *mod, | 1999 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
1992 | Elf_Shdr *sechdrs, | ||
1993 | unsigned int shnum, | ||
1994 | unsigned int symindex, | ||
1995 | unsigned int strindex, | ||
1996 | unsigned long symoffs, | ||
1997 | unsigned long stroffs, | ||
1998 | const char *secstrings, | ||
1999 | unsigned long *strmap) | ||
2000 | { | 2000 | { |
2001 | unsigned int i, ndst; | 2001 | unsigned int i, ndst; |
2002 | const Elf_Sym *src; | 2002 | const Elf_Sym *src; |
2003 | Elf_Sym *dst; | 2003 | Elf_Sym *dst; |
2004 | char *s; | 2004 | char *s; |
2005 | Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; | ||
2005 | 2006 | ||
2006 | mod->symtab = (void *)sechdrs[symindex].sh_addr; | 2007 | mod->symtab = (void *)symsec->sh_addr; |
2007 | mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); | 2008 | mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); |
2008 | mod->strtab = (void *)sechdrs[strindex].sh_addr; | 2009 | /* Make sure we get permanent strtab: don't use info->strtab. */ |
2010 | mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; | ||
2009 | 2011 | ||
2010 | /* Set types up while we still have access to sections. */ | 2012 | /* Set types up while we still have access to sections. */ |
2011 | for (i = 0; i < mod->num_symtab; i++) | 2013 | for (i = 0; i < mod->num_symtab; i++) |
2012 | mod->symtab[i].st_info | 2014 | mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); |
2013 | = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); | ||
2014 | 2015 | ||
2015 | mod->core_symtab = dst = mod->module_core + symoffs; | 2016 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2016 | src = mod->symtab; | 2017 | src = mod->symtab; |
2017 | *dst = *src; | 2018 | *dst = *src; |
2018 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { | 2019 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { |
2019 | if (!is_core_symbol(src, sechdrs, shnum)) | 2020 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) |
2020 | continue; | 2021 | continue; |
2021 | dst[ndst] = *src; | 2022 | dst[ndst] = *src; |
2022 | dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); | 2023 | dst[ndst].st_name = bitmap_weight(info->strmap, |
2024 | dst[ndst].st_name); | ||
2023 | ++ndst; | 2025 | ++ndst; |
2024 | } | 2026 | } |
2025 | mod->core_num_syms = ndst; | 2027 | mod->core_num_syms = ndst; |
2026 | 2028 | ||
2027 | mod->core_strtab = s = mod->module_core + stroffs; | 2029 | mod->core_strtab = s = mod->module_core + info->stroffs; |
2028 | for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) | 2030 | for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) |
2029 | if (test_bit(i, strmap)) | 2031 | if (test_bit(i, info->strmap)) |
2030 | *++s = mod->strtab[i]; | 2032 | *++s = mod->strtab[i]; |
2031 | } | 2033 | } |
2032 | #else | 2034 | #else |
2033 | static inline unsigned long layout_symtab(struct module *mod, | 2035 | static inline void layout_symtab(struct module *mod, struct load_info *info) |
2034 | Elf_Shdr *sechdrs, | ||
2035 | unsigned int symindex, | ||
2036 | unsigned int strindex, | ||
2037 | const Elf_Ehdr *hdr, | ||
2038 | const char *secstrings, | ||
2039 | unsigned long *pstroffs, | ||
2040 | unsigned long *strmap) | ||
2041 | { | 2036 | { |
2042 | return 0; | ||
2043 | } | 2037 | } |
2044 | 2038 | ||
2045 | static inline void add_kallsyms(struct module *mod, | 2039 | static void add_kallsyms(struct module *mod, struct load_info *info) |
2046 | Elf_Shdr *sechdrs, | ||
2047 | unsigned int shnum, | ||
2048 | unsigned int symindex, | ||
2049 | unsigned int strindex, | ||
2050 | unsigned long symoffs, | ||
2051 | unsigned long stroffs, | ||
2052 | const char *secstrings, | ||
2053 | const unsigned long *strmap) | ||
2054 | { | 2040 | { |
2055 | } | 2041 | } |
2056 | #endif /* CONFIG_KALLSYMS */ | 2042 | #endif /* CONFIG_KALLSYMS */ |
2057 | 2043 | ||
2058 | static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) | 2044 | static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) |
2059 | { | 2045 | { |
2046 | if (!debug) | ||
2047 | return; | ||
2060 | #ifdef CONFIG_DYNAMIC_DEBUG | 2048 | #ifdef CONFIG_DYNAMIC_DEBUG |
2061 | if (ddebug_add_module(debug, num, debug->modname)) | 2049 | if (ddebug_add_module(debug, num, debug->modname)) |
2062 | printk(KERN_ERR "dynamic debug error adding module: %s\n", | 2050 | printk(KERN_ERR "dynamic debug error adding module: %s\n", |
@@ -2087,65 +2075,47 @@ static void *module_alloc_update_bounds(unsigned long size) | |||
2087 | } | 2075 | } |
2088 | 2076 | ||
2089 | #ifdef CONFIG_DEBUG_KMEMLEAK | 2077 | #ifdef CONFIG_DEBUG_KMEMLEAK |
2090 | static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, | 2078 | static void kmemleak_load_module(const struct module *mod, |
2091 | Elf_Shdr *sechdrs, char *secstrings) | 2079 | const struct load_info *info) |
2092 | { | 2080 | { |
2093 | unsigned int i; | 2081 | unsigned int i; |
2094 | 2082 | ||
2095 | /* only scan the sections containing data */ | 2083 | /* only scan the sections containing data */ |
2096 | kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); | 2084 | kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); |
2097 | 2085 | ||
2098 | for (i = 1; i < hdr->e_shnum; i++) { | 2086 | for (i = 1; i < info->hdr->e_shnum; i++) { |
2099 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | 2087 | const char *name = info->secstrings + info->sechdrs[i].sh_name; |
2088 | if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) | ||
2100 | continue; | 2089 | continue; |
2101 | if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 | 2090 | if (!strstarts(name, ".data") && !strstarts(name, ".bss")) |
2102 | && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) | ||
2103 | continue; | 2091 | continue; |
2104 | 2092 | ||
2105 | kmemleak_scan_area((void *)sechdrs[i].sh_addr, | 2093 | kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, |
2106 | sechdrs[i].sh_size, GFP_KERNEL); | 2094 | info->sechdrs[i].sh_size, GFP_KERNEL); |
2107 | } | 2095 | } |
2108 | } | 2096 | } |
2109 | #else | 2097 | #else |
2110 | static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, | 2098 | static inline void kmemleak_load_module(const struct module *mod, |
2111 | Elf_Shdr *sechdrs, char *secstrings) | 2099 | const struct load_info *info) |
2112 | { | 2100 | { |
2113 | } | 2101 | } |
2114 | #endif | 2102 | #endif |
2115 | 2103 | ||
2116 | /* Allocate and load the module: note that size of section 0 is always | 2104 | /* Sets info->hdr and info->len. */ |
2117 | zero, and we rely on this for optional sections. */ | 2105 | static int copy_and_check(struct load_info *info, |
2118 | static noinline struct module *load_module(void __user *umod, | 2106 | const void __user *umod, unsigned long len, |
2119 | unsigned long len, | 2107 | const char __user *uargs) |
2120 | const char __user *uargs) | ||
2121 | { | 2108 | { |
2109 | int err; | ||
2122 | Elf_Ehdr *hdr; | 2110 | Elf_Ehdr *hdr; |
2123 | Elf_Shdr *sechdrs; | ||
2124 | char *secstrings, *args, *modmagic, *strtab = NULL; | ||
2125 | char *staging; | ||
2126 | unsigned int i; | ||
2127 | unsigned int symindex = 0; | ||
2128 | unsigned int strindex = 0; | ||
2129 | unsigned int modindex, versindex, infoindex, pcpuindex; | ||
2130 | struct module *mod; | ||
2131 | long err = 0; | ||
2132 | void *ptr = NULL; /* Stops spurious gcc warning */ | ||
2133 | unsigned long symoffs, stroffs, *strmap; | ||
2134 | void __percpu *percpu; | ||
2135 | struct _ddebug *debug = NULL; | ||
2136 | unsigned int num_debug = 0; | ||
2137 | 2111 | ||
2138 | mm_segment_t old_fs; | ||
2139 | |||
2140 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | ||
2141 | umod, len, uargs); | ||
2142 | if (len < sizeof(*hdr)) | 2112 | if (len < sizeof(*hdr)) |
2143 | return ERR_PTR(-ENOEXEC); | 2113 | return -ENOEXEC; |
2144 | 2114 | ||
2145 | /* Suck in entire file: we'll want most of it. */ | 2115 | /* Suck in entire file: we'll want most of it. */ |
2146 | /* vmalloc barfs on "unusual" numbers. Check here */ | 2116 | /* vmalloc barfs on "unusual" numbers. Check here */ |
2147 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) | 2117 | if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) |
2148 | return ERR_PTR(-ENOMEM); | 2118 | return -ENOMEM; |
2149 | 2119 | ||
2150 | if (copy_from_user(hdr, umod, len) != 0) { | 2120 | if (copy_from_user(hdr, umod, len) != 0) { |
2151 | err = -EFAULT; | 2121 | err = -EFAULT; |
@@ -2153,135 +2123,225 @@ static noinline struct module *load_module(void __user *umod, | |||
2153 | } | 2123 | } |
2154 | 2124 | ||
2155 | /* Sanity checks against insmoding binaries or wrong arch, | 2125 | /* Sanity checks against insmoding binaries or wrong arch, |
2156 | weird elf version */ | 2126 | weird elf version */ |
2157 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 | 2127 | if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 |
2158 | || hdr->e_type != ET_REL | 2128 | || hdr->e_type != ET_REL |
2159 | || !elf_check_arch(hdr) | 2129 | || !elf_check_arch(hdr) |
2160 | || hdr->e_shentsize != sizeof(*sechdrs)) { | 2130 | || hdr->e_shentsize != sizeof(Elf_Shdr)) { |
2161 | err = -ENOEXEC; | 2131 | err = -ENOEXEC; |
2162 | goto free_hdr; | 2132 | goto free_hdr; |
2163 | } | 2133 | } |
2164 | 2134 | ||
2165 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) | 2135 | if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { |
2166 | goto truncated; | 2136 | err = -ENOEXEC; |
2137 | goto free_hdr; | ||
2138 | } | ||
2167 | 2139 | ||
2168 | /* Convenience variables */ | 2140 | info->hdr = hdr; |
2169 | sechdrs = (void *)hdr + hdr->e_shoff; | 2141 | info->len = len; |
2170 | secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; | 2142 | return 0; |
2171 | sechdrs[0].sh_addr = 0; | ||
2172 | 2143 | ||
2173 | for (i = 1; i < hdr->e_shnum; i++) { | 2144 | free_hdr: |
2174 | if (sechdrs[i].sh_type != SHT_NOBITS | 2145 | vfree(hdr); |
2175 | && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) | 2146 | return err; |
2176 | goto truncated; | 2147 | } |
2148 | |||
2149 | static void free_copy(struct load_info *info) | ||
2150 | { | ||
2151 | vfree(info->hdr); | ||
2152 | } | ||
2153 | |||
2154 | static int rewrite_section_headers(struct load_info *info) | ||
2155 | { | ||
2156 | unsigned int i; | ||
2157 | |||
2158 | /* This should always be true, but let's be sure. */ | ||
2159 | info->sechdrs[0].sh_addr = 0; | ||
2160 | |||
2161 | for (i = 1; i < info->hdr->e_shnum; i++) { | ||
2162 | Elf_Shdr *shdr = &info->sechdrs[i]; | ||
2163 | if (shdr->sh_type != SHT_NOBITS | ||
2164 | && info->len < shdr->sh_offset + shdr->sh_size) { | ||
2165 | printk(KERN_ERR "Module len %lu truncated\n", | ||
2166 | info->len); | ||
2167 | return -ENOEXEC; | ||
2168 | } | ||
2177 | 2169 | ||
2178 | /* Mark all sections sh_addr with their address in the | 2170 | /* Mark all sections sh_addr with their address in the |
2179 | temporary image. */ | 2171 | temporary image. */ |
2180 | sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; | 2172 | shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; |
2181 | 2173 | ||
2182 | /* Internal symbols and strings. */ | ||
2183 | if (sechdrs[i].sh_type == SHT_SYMTAB) { | ||
2184 | symindex = i; | ||
2185 | strindex = sechdrs[i].sh_link; | ||
2186 | strtab = (char *)hdr + sechdrs[strindex].sh_offset; | ||
2187 | } | ||
2188 | #ifndef CONFIG_MODULE_UNLOAD | 2174 | #ifndef CONFIG_MODULE_UNLOAD |
2189 | /* Don't load .exit sections */ | 2175 | /* Don't load .exit sections */ |
2190 | if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) | 2176 | if (strstarts(info->secstrings+shdr->sh_name, ".exit")) |
2191 | sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2177 | shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; |
2192 | #endif | 2178 | #endif |
2193 | } | 2179 | } |
2194 | 2180 | ||
2195 | modindex = find_sec(hdr, sechdrs, secstrings, | 2181 | /* Track but don't keep modinfo and version sections. */ |
2196 | ".gnu.linkonce.this_module"); | 2182 | info->index.vers = find_sec(info, "__versions"); |
2197 | if (!modindex) { | 2183 | info->index.info = find_sec(info, ".modinfo"); |
2184 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2185 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2186 | return 0; | ||
2187 | } | ||
2188 | |||
2189 | /* | ||
2190 | * Set up our basic convenience variables (pointers to section headers, | ||
2191 | * search for module section index etc), and do some basic section | ||
2192 | * verification. | ||
2193 | * | ||
2194 | * Return the temporary module pointer (we'll replace it with the final | ||
2195 | * one when we move the module sections around). | ||
2196 | */ | ||
2197 | static struct module *setup_load_info(struct load_info *info) | ||
2198 | { | ||
2199 | unsigned int i; | ||
2200 | int err; | ||
2201 | struct module *mod; | ||
2202 | |||
2203 | /* Set up the convenience variables */ | ||
2204 | info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; | ||
2205 | info->secstrings = (void *)info->hdr | ||
2206 | + info->sechdrs[info->hdr->e_shstrndx].sh_offset; | ||
2207 | |||
2208 | err = rewrite_section_headers(info); | ||
2209 | if (err) | ||
2210 | return ERR_PTR(err); | ||
2211 | |||
2212 | /* Find internal symbols and strings. */ | ||
2213 | for (i = 1; i < info->hdr->e_shnum; i++) { | ||
2214 | if (info->sechdrs[i].sh_type == SHT_SYMTAB) { | ||
2215 | info->index.sym = i; | ||
2216 | info->index.str = info->sechdrs[i].sh_link; | ||
2217 | info->strtab = (char *)info->hdr | ||
2218 | + info->sechdrs[info->index.str].sh_offset; | ||
2219 | break; | ||
2220 | } | ||
2221 | } | ||
2222 | |||
2223 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); | ||
2224 | if (!info->index.mod) { | ||
2198 | printk(KERN_WARNING "No module found in object\n"); | 2225 | printk(KERN_WARNING "No module found in object\n"); |
2199 | err = -ENOEXEC; | 2226 | return ERR_PTR(-ENOEXEC); |
2200 | goto free_hdr; | ||
2201 | } | 2227 | } |
2202 | /* This is temporary: point mod into copy of data. */ | 2228 | /* This is temporary: point mod into copy of data. */ |
2203 | mod = (void *)sechdrs[modindex].sh_addr; | 2229 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
2204 | 2230 | ||
2205 | if (symindex == 0) { | 2231 | if (info->index.sym == 0) { |
2206 | printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", | 2232 | printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", |
2207 | mod->name); | 2233 | mod->name); |
2208 | err = -ENOEXEC; | 2234 | return ERR_PTR(-ENOEXEC); |
2209 | goto free_hdr; | ||
2210 | } | 2235 | } |
2211 | 2236 | ||
2212 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 2237 | info->index.pcpu = find_pcpusec(info); |
2213 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | ||
2214 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | ||
2215 | |||
2216 | /* Don't keep modinfo and version sections. */ | ||
2217 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2218 | sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2219 | 2238 | ||
2220 | /* Check module struct version now, before we try to use module. */ | 2239 | /* Check module struct version now, before we try to use module. */ |
2221 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 2240 | if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) |
2222 | err = -ENOEXEC; | 2241 | return ERR_PTR(-ENOEXEC); |
2223 | goto free_hdr; | 2242 | |
2224 | } | 2243 | return mod; |
2244 | } | ||
2245 | |||
2246 | static int check_modinfo(struct module *mod, struct load_info *info) | ||
2247 | { | ||
2248 | const char *modmagic = get_modinfo(info, "vermagic"); | ||
2249 | int err; | ||
2225 | 2250 | ||
2226 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); | ||
2227 | /* This is allowed: modprobe --force will invalidate it. */ | 2251 | /* This is allowed: modprobe --force will invalidate it. */ |
2228 | if (!modmagic) { | 2252 | if (!modmagic) { |
2229 | err = try_to_force_load(mod, "bad vermagic"); | 2253 | err = try_to_force_load(mod, "bad vermagic"); |
2230 | if (err) | 2254 | if (err) |
2231 | goto free_hdr; | 2255 | return err; |
2232 | } else if (!same_magic(modmagic, vermagic, versindex)) { | 2256 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { |
2233 | printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", | 2257 | printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", |
2234 | mod->name, modmagic, vermagic); | 2258 | mod->name, modmagic, vermagic); |
2235 | err = -ENOEXEC; | 2259 | return -ENOEXEC; |
2236 | goto free_hdr; | ||
2237 | } | 2260 | } |
2238 | 2261 | ||
2239 | staging = get_modinfo(sechdrs, infoindex, "staging"); | 2262 | if (get_modinfo(info, "staging")) { |
2240 | if (staging) { | ||
2241 | add_taint_module(mod, TAINT_CRAP); | 2263 | add_taint_module(mod, TAINT_CRAP); |
2242 | printk(KERN_WARNING "%s: module is from the staging directory," | 2264 | printk(KERN_WARNING "%s: module is from the staging directory," |
2243 | " the quality is unknown, you have been warned.\n", | 2265 | " the quality is unknown, you have been warned.\n", |
2244 | mod->name); | 2266 | mod->name); |
2245 | } | 2267 | } |
2246 | 2268 | ||
2247 | /* Now copy in args */ | 2269 | /* Set up license info based on the info section */ |
2248 | args = strndup_user(uargs, ~0UL >> 1); | 2270 | set_license(mod, get_modinfo(info, "license")); |
2249 | if (IS_ERR(args)) { | ||
2250 | err = PTR_ERR(args); | ||
2251 | goto free_hdr; | ||
2252 | } | ||
2253 | 2271 | ||
2254 | strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) | 2272 | return 0; |
2255 | * sizeof(long), GFP_KERNEL); | 2273 | } |
2256 | if (!strmap) { | ||
2257 | err = -ENOMEM; | ||
2258 | goto free_mod; | ||
2259 | } | ||
2260 | 2274 | ||
2261 | mod->state = MODULE_STATE_COMING; | 2275 | static void find_module_sections(struct module *mod, struct load_info *info) |
2276 | { | ||
2277 | mod->kp = section_objs(info, "__param", | ||
2278 | sizeof(*mod->kp), &mod->num_kp); | ||
2279 | mod->syms = section_objs(info, "__ksymtab", | ||
2280 | sizeof(*mod->syms), &mod->num_syms); | ||
2281 | mod->crcs = section_addr(info, "__kcrctab"); | ||
2282 | mod->gpl_syms = section_objs(info, "__ksymtab_gpl", | ||
2283 | sizeof(*mod->gpl_syms), | ||
2284 | &mod->num_gpl_syms); | ||
2285 | mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); | ||
2286 | mod->gpl_future_syms = section_objs(info, | ||
2287 | "__ksymtab_gpl_future", | ||
2288 | sizeof(*mod->gpl_future_syms), | ||
2289 | &mod->num_gpl_future_syms); | ||
2290 | mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); | ||
2262 | 2291 | ||
2263 | /* Allow arches to frob section contents and sizes. */ | 2292 | #ifdef CONFIG_UNUSED_SYMBOLS |
2264 | err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); | 2293 | mod->unused_syms = section_objs(info, "__ksymtab_unused", |
2265 | if (err < 0) | 2294 | sizeof(*mod->unused_syms), |
2266 | goto free_mod; | 2295 | &mod->num_unused_syms); |
2296 | mod->unused_crcs = section_addr(info, "__kcrctab_unused"); | ||
2297 | mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", | ||
2298 | sizeof(*mod->unused_gpl_syms), | ||
2299 | &mod->num_unused_gpl_syms); | ||
2300 | mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); | ||
2301 | #endif | ||
2302 | #ifdef CONFIG_CONSTRUCTORS | ||
2303 | mod->ctors = section_objs(info, ".ctors", | ||
2304 | sizeof(*mod->ctors), &mod->num_ctors); | ||
2305 | #endif | ||
2267 | 2306 | ||
2268 | if (pcpuindex) { | 2307 | #ifdef CONFIG_TRACEPOINTS |
2269 | /* We have a special allocation for this section. */ | 2308 | mod->tracepoints = section_objs(info, "__tracepoints", |
2270 | err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, | 2309 | sizeof(*mod->tracepoints), |
2271 | sechdrs[pcpuindex].sh_addralign); | 2310 | &mod->num_tracepoints); |
2272 | if (err) | 2311 | #endif |
2273 | goto free_mod; | 2312 | #ifdef CONFIG_EVENT_TRACING |
2274 | sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2313 | mod->trace_events = section_objs(info, "_ftrace_events", |
2275 | } | 2314 | sizeof(*mod->trace_events), |
2276 | /* Keep this around for failure path. */ | 2315 | &mod->num_trace_events); |
2277 | percpu = mod_percpu(mod); | 2316 | /* |
2317 | * This section contains pointers to allocated objects in the trace | ||
2318 | * code and not scanning it leads to false positives. | ||
2319 | */ | ||
2320 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | ||
2321 | mod->num_trace_events, GFP_KERNEL); | ||
2322 | #endif | ||
2323 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | ||
2324 | /* sechdrs[0].sh_size is always zero */ | ||
2325 | mod->ftrace_callsites = section_objs(info, "__mcount_loc", | ||
2326 | sizeof(*mod->ftrace_callsites), | ||
2327 | &mod->num_ftrace_callsites); | ||
2328 | #endif | ||
2278 | 2329 | ||
2279 | /* Determine total sizes, and put offsets in sh_entsize. For now | 2330 | mod->extable = section_objs(info, "__ex_table", |
2280 | this is done generically; there doesn't appear to be any | 2331 | sizeof(*mod->extable), &mod->num_exentries); |
2281 | special cases for the architectures. */ | 2332 | |
2282 | layout_sections(mod, hdr, sechdrs, secstrings); | 2333 | if (section_addr(info, "__obsparm")) |
2283 | symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, | 2334 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", |
2284 | secstrings, &stroffs, strmap); | 2335 | mod->name); |
2336 | |||
2337 | info->debug = section_objs(info, "__verbose", | ||
2338 | sizeof(*info->debug), &info->num_debug); | ||
2339 | } | ||
2340 | |||
2341 | static int move_module(struct module *mod, struct load_info *info) | ||
2342 | { | ||
2343 | int i; | ||
2344 | void *ptr; | ||
2285 | 2345 | ||
2286 | /* Do the allocs. */ | 2346 | /* Do the allocs. */ |
2287 | ptr = module_alloc_update_bounds(mod->core_size); | 2347 | ptr = module_alloc_update_bounds(mod->core_size); |
@@ -2291,10 +2351,9 @@ static noinline struct module *load_module(void __user *umod, | |||
2291 | * leak. | 2351 | * leak. |
2292 | */ | 2352 | */ |
2293 | kmemleak_not_leak(ptr); | 2353 | kmemleak_not_leak(ptr); |
2294 | if (!ptr) { | 2354 | if (!ptr) |
2295 | err = -ENOMEM; | 2355 | return -ENOMEM; |
2296 | goto free_percpu; | 2356 | |
2297 | } | ||
2298 | memset(ptr, 0, mod->core_size); | 2357 | memset(ptr, 0, mod->core_size); |
2299 | mod->module_core = ptr; | 2358 | mod->module_core = ptr; |
2300 | 2359 | ||
@@ -2307,50 +2366,40 @@ static noinline struct module *load_module(void __user *umod, | |||
2307 | */ | 2366 | */ |
2308 | kmemleak_ignore(ptr); | 2367 | kmemleak_ignore(ptr); |
2309 | if (!ptr && mod->init_size) { | 2368 | if (!ptr && mod->init_size) { |
2310 | err = -ENOMEM; | 2369 | module_free(mod, mod->module_core); |
2311 | goto free_core; | 2370 | return -ENOMEM; |
2312 | } | 2371 | } |
2313 | memset(ptr, 0, mod->init_size); | 2372 | memset(ptr, 0, mod->init_size); |
2314 | mod->module_init = ptr; | 2373 | mod->module_init = ptr; |
2315 | 2374 | ||
2316 | /* Transfer each section which specifies SHF_ALLOC */ | 2375 | /* Transfer each section which specifies SHF_ALLOC */ |
2317 | DEBUGP("final section addresses:\n"); | 2376 | DEBUGP("final section addresses:\n"); |
2318 | for (i = 0; i < hdr->e_shnum; i++) { | 2377 | for (i = 0; i < info->hdr->e_shnum; i++) { |
2319 | void *dest; | 2378 | void *dest; |
2379 | Elf_Shdr *shdr = &info->sechdrs[i]; | ||
2320 | 2380 | ||
2321 | if (!(sechdrs[i].sh_flags & SHF_ALLOC)) | 2381 | if (!(shdr->sh_flags & SHF_ALLOC)) |
2322 | continue; | 2382 | continue; |
2323 | 2383 | ||
2324 | if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) | 2384 | if (shdr->sh_entsize & INIT_OFFSET_MASK) |
2325 | dest = mod->module_init | 2385 | dest = mod->module_init |
2326 | + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); | 2386 | + (shdr->sh_entsize & ~INIT_OFFSET_MASK); |
2327 | else | 2387 | else |
2328 | dest = mod->module_core + sechdrs[i].sh_entsize; | 2388 | dest = mod->module_core + shdr->sh_entsize; |
2329 | 2389 | ||
2330 | if (sechdrs[i].sh_type != SHT_NOBITS) | 2390 | if (shdr->sh_type != SHT_NOBITS) |
2331 | memcpy(dest, (void *)sechdrs[i].sh_addr, | 2391 | memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); |
2332 | sechdrs[i].sh_size); | ||
2333 | /* Update sh_addr to point to copy in image. */ | 2392 | /* Update sh_addr to point to copy in image. */ |
2334 | sechdrs[i].sh_addr = (unsigned long)dest; | 2393 | shdr->sh_addr = (unsigned long)dest; |
2335 | DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); | 2394 | DEBUGP("\t0x%lx %s\n", |
2336 | } | 2395 | shdr->sh_addr, info->secstrings + shdr->sh_name); |
2337 | /* Module has been moved. */ | ||
2338 | mod = (void *)sechdrs[modindex].sh_addr; | ||
2339 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); | ||
2340 | |||
2341 | #if defined(CONFIG_MODULE_UNLOAD) | ||
2342 | mod->refptr = alloc_percpu(struct module_ref); | ||
2343 | if (!mod->refptr) { | ||
2344 | err = -ENOMEM; | ||
2345 | goto free_init; | ||
2346 | } | 2396 | } |
2347 | #endif | ||
2348 | /* Now we've moved module, initialize linked lists, etc. */ | ||
2349 | module_unload_init(mod); | ||
2350 | 2397 | ||
2351 | /* Set up license info based on the info section */ | 2398 | return 0; |
2352 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 2399 | } |
2353 | 2400 | ||
2401 | static int check_module_license_and_versions(struct module *mod) | ||
2402 | { | ||
2354 | /* | 2403 | /* |
2355 | * ndiswrapper is under GPL by itself, but loads proprietary modules. | 2404 | * ndiswrapper is under GPL by itself, but loads proprietary modules. |
2356 | * Don't use add_taint_module(), as it would prevent ndiswrapper from | 2405 | * Don't use add_taint_module(), as it would prevent ndiswrapper from |
@@ -2363,77 +2412,6 @@ static noinline struct module *load_module(void __user *umod, | |||
2363 | if (strcmp(mod->name, "driverloader") == 0) | 2412 | if (strcmp(mod->name, "driverloader") == 0) |
2364 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 2413 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
2365 | 2414 | ||
2366 | /* Set up MODINFO_ATTR fields */ | ||
2367 | setup_modinfo(mod, sechdrs, infoindex); | ||
2368 | |||
2369 | /* Fix up syms, so that st_value is a pointer to location. */ | ||
2370 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, | ||
2371 | mod); | ||
2372 | if (err < 0) | ||
2373 | goto cleanup; | ||
2374 | |||
2375 | /* Now we've got everything in the final locations, we can | ||
2376 | * find optional sections. */ | ||
2377 | mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", | ||
2378 | sizeof(*mod->kp), &mod->num_kp); | ||
2379 | mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", | ||
2380 | sizeof(*mod->syms), &mod->num_syms); | ||
2381 | mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); | ||
2382 | mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", | ||
2383 | sizeof(*mod->gpl_syms), | ||
2384 | &mod->num_gpl_syms); | ||
2385 | mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | ||
2386 | mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, | ||
2387 | "__ksymtab_gpl_future", | ||
2388 | sizeof(*mod->gpl_future_syms), | ||
2389 | &mod->num_gpl_future_syms); | ||
2390 | mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, | ||
2391 | "__kcrctab_gpl_future"); | ||
2392 | |||
2393 | #ifdef CONFIG_UNUSED_SYMBOLS | ||
2394 | mod->unused_syms = section_objs(hdr, sechdrs, secstrings, | ||
2395 | "__ksymtab_unused", | ||
2396 | sizeof(*mod->unused_syms), | ||
2397 | &mod->num_unused_syms); | ||
2398 | mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, | ||
2399 | "__kcrctab_unused"); | ||
2400 | mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, | ||
2401 | "__ksymtab_unused_gpl", | ||
2402 | sizeof(*mod->unused_gpl_syms), | ||
2403 | &mod->num_unused_gpl_syms); | ||
2404 | mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, | ||
2405 | "__kcrctab_unused_gpl"); | ||
2406 | #endif | ||
2407 | #ifdef CONFIG_CONSTRUCTORS | ||
2408 | mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", | ||
2409 | sizeof(*mod->ctors), &mod->num_ctors); | ||
2410 | #endif | ||
2411 | |||
2412 | #ifdef CONFIG_TRACEPOINTS | ||
2413 | mod->tracepoints = section_objs(hdr, sechdrs, secstrings, | ||
2414 | "__tracepoints", | ||
2415 | sizeof(*mod->tracepoints), | ||
2416 | &mod->num_tracepoints); | ||
2417 | #endif | ||
2418 | #ifdef CONFIG_EVENT_TRACING | ||
2419 | mod->trace_events = section_objs(hdr, sechdrs, secstrings, | ||
2420 | "_ftrace_events", | ||
2421 | sizeof(*mod->trace_events), | ||
2422 | &mod->num_trace_events); | ||
2423 | /* | ||
2424 | * This section contains pointers to allocated objects in the trace | ||
2425 | * code and not scanning it leads to false positives. | ||
2426 | */ | ||
2427 | kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * | ||
2428 | mod->num_trace_events, GFP_KERNEL); | ||
2429 | #endif | ||
2430 | #ifdef CONFIG_FTRACE_MCOUNT_RECORD | ||
2431 | /* sechdrs[0].sh_size is always zero */ | ||
2432 | mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, | ||
2433 | "__mcount_loc", | ||
2434 | sizeof(*mod->ftrace_callsites), | ||
2435 | &mod->num_ftrace_callsites); | ||
2436 | #endif | ||
2437 | #ifdef CONFIG_MODVERSIONS | 2415 | #ifdef CONFIG_MODVERSIONS |
2438 | if ((mod->num_syms && !mod->crcs) | 2416 | if ((mod->num_syms && !mod->crcs) |
2439 | || (mod->num_gpl_syms && !mod->gpl_crcs) | 2417 | || (mod->num_gpl_syms && !mod->gpl_crcs) |
@@ -2443,56 +2421,16 @@ static noinline struct module *load_module(void __user *umod, | |||
2443 | || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) | 2421 | || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) |
2444 | #endif | 2422 | #endif |
2445 | ) { | 2423 | ) { |
2446 | err = try_to_force_load(mod, | 2424 | return try_to_force_load(mod, |
2447 | "no versions for exported symbols"); | 2425 | "no versions for exported symbols"); |
2448 | if (err) | ||
2449 | goto cleanup; | ||
2450 | } | 2426 | } |
2451 | #endif | 2427 | #endif |
2428 | return 0; | ||
2429 | } | ||
2452 | 2430 | ||
2453 | /* Now do relocations. */ | 2431 | static void flush_module_icache(const struct module *mod) |
2454 | for (i = 1; i < hdr->e_shnum; i++) { | 2432 | { |
2455 | const char *strtab = (char *)sechdrs[strindex].sh_addr; | 2433 | mm_segment_t old_fs; |
2456 | unsigned int info = sechdrs[i].sh_info; | ||
2457 | |||
2458 | /* Not a valid relocation section? */ | ||
2459 | if (info >= hdr->e_shnum) | ||
2460 | continue; | ||
2461 | |||
2462 | /* Don't bother with non-allocated sections */ | ||
2463 | if (!(sechdrs[info].sh_flags & SHF_ALLOC)) | ||
2464 | continue; | ||
2465 | |||
2466 | if (sechdrs[i].sh_type == SHT_REL) | ||
2467 | err = apply_relocate(sechdrs, strtab, symindex, i,mod); | ||
2468 | else if (sechdrs[i].sh_type == SHT_RELA) | ||
2469 | err = apply_relocate_add(sechdrs, strtab, symindex, i, | ||
2470 | mod); | ||
2471 | if (err < 0) | ||
2472 | goto cleanup; | ||
2473 | } | ||
2474 | |||
2475 | /* Set up and sort exception table */ | ||
2476 | mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", | ||
2477 | sizeof(*mod->extable), &mod->num_exentries); | ||
2478 | sort_extable(mod->extable, mod->extable + mod->num_exentries); | ||
2479 | |||
2480 | /* Finally, copy percpu area over. */ | ||
2481 | percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, | ||
2482 | sechdrs[pcpuindex].sh_size); | ||
2483 | |||
2484 | add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, | ||
2485 | symoffs, stroffs, secstrings, strmap); | ||
2486 | kfree(strmap); | ||
2487 | strmap = NULL; | ||
2488 | |||
2489 | if (!mod->taints) | ||
2490 | debug = section_objs(hdr, sechdrs, secstrings, "__verbose", | ||
2491 | sizeof(*debug), &num_debug); | ||
2492 | |||
2493 | err = module_finalize(hdr, sechdrs, mod); | ||
2494 | if (err < 0) | ||
2495 | goto cleanup; | ||
2496 | 2434 | ||
2497 | /* flush the icache in correct context */ | 2435 | /* flush the icache in correct context */ |
2498 | old_fs = get_fs(); | 2436 | old_fs = get_fs(); |
@@ -2511,11 +2449,160 @@ static noinline struct module *load_module(void __user *umod, | |||
2511 | (unsigned long)mod->module_core + mod->core_size); | 2449 | (unsigned long)mod->module_core + mod->core_size); |
2512 | 2450 | ||
2513 | set_fs(old_fs); | 2451 | set_fs(old_fs); |
2452 | } | ||
2514 | 2453 | ||
2515 | mod->args = args; | 2454 | static struct module *layout_and_allocate(struct load_info *info) |
2516 | if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) | 2455 | { |
2517 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | 2456 | /* Module within temporary copy. */ |
2518 | mod->name); | 2457 | struct module *mod; |
2458 | Elf_Shdr *pcpusec; | ||
2459 | int err; | ||
2460 | |||
2461 | mod = setup_load_info(info); | ||
2462 | if (IS_ERR(mod)) | ||
2463 | return mod; | ||
2464 | |||
2465 | err = check_modinfo(mod, info); | ||
2466 | if (err) | ||
2467 | return ERR_PTR(err); | ||
2468 | |||
2469 | /* Allow arches to frob section contents and sizes. */ | ||
2470 | err = module_frob_arch_sections(info->hdr, info->sechdrs, | ||
2471 | info->secstrings, mod); | ||
2472 | if (err < 0) | ||
2473 | goto out; | ||
2474 | |||
2475 | pcpusec = &info->sechdrs[info->index.pcpu]; | ||
2476 | if (pcpusec->sh_size) { | ||
2477 | /* We have a special allocation for this section. */ | ||
2478 | err = percpu_modalloc(mod, | ||
2479 | pcpusec->sh_size, pcpusec->sh_addralign); | ||
2480 | if (err) | ||
2481 | goto out; | ||
2482 | pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
2483 | } | ||
2484 | |||
2485 | /* Determine total sizes, and put offsets in sh_entsize. For now | ||
2486 | this is done generically; there doesn't appear to be any | ||
2487 | special cases for the architectures. */ | ||
2488 | layout_sections(mod, info); | ||
2489 | |||
2490 | info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) | ||
2491 | * sizeof(long), GFP_KERNEL); | ||
2492 | if (!info->strmap) { | ||
2493 | err = -ENOMEM; | ||
2494 | goto free_percpu; | ||
2495 | } | ||
2496 | layout_symtab(mod, info); | ||
2497 | |||
2498 | /* Allocate and move to the final place */ | ||
2499 | err = move_module(mod, info); | ||
2500 | if (err) | ||
2501 | goto free_strmap; | ||
2502 | |||
2503 | /* Module has been copied to its final place now: return it. */ | ||
2504 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | ||
2505 | kmemleak_load_module(mod, info); | ||
2506 | return mod; | ||
2507 | |||
2508 | free_strmap: | ||
2509 | kfree(info->strmap); | ||
2510 | free_percpu: | ||
2511 | percpu_modfree(mod); | ||
2512 | out: | ||
2513 | return ERR_PTR(err); | ||
2514 | } | ||
2515 | |||
2516 | /* mod is no longer valid after this! */ | ||
2517 | static void module_deallocate(struct module *mod, struct load_info *info) | ||
2518 | { | ||
2519 | kfree(info->strmap); | ||
2520 | percpu_modfree(mod); | ||
2521 | module_free(mod, mod->module_init); | ||
2522 | module_free(mod, mod->module_core); | ||
2523 | } | ||
2524 | |||
2525 | static int post_relocation(struct module *mod, const struct load_info *info) | ||
2526 | { | ||
2527 | /* Sort exception table now relocations are done. */ | ||
2528 | sort_extable(mod->extable, mod->extable + mod->num_exentries); | ||
2529 | |||
2530 | /* Copy relocated percpu area over. */ | ||
2531 | percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, | ||
2532 | info->sechdrs[info->index.pcpu].sh_size); | ||
2533 | |||
2534 | /* Setup kallsyms-specific fields. */ | ||
2535 | add_kallsyms(mod, info); | ||
2536 | |||
2537 | /* Arch-specific module finalizing. */ | ||
2538 | return module_finalize(info->hdr, info->sechdrs, mod); | ||
2539 | } | ||
2540 | |||
2541 | /* Allocate and load the module: note that size of section 0 is always | ||
2542 | zero, and we rely on this for optional sections. */ | ||
2543 | static struct module *load_module(void __user *umod, | ||
2544 | unsigned long len, | ||
2545 | const char __user *uargs) | ||
2546 | { | ||
2547 | struct load_info info = { NULL, }; | ||
2548 | struct module *mod; | ||
2549 | long err; | ||
2550 | |||
2551 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | ||
2552 | umod, len, uargs); | ||
2553 | |||
2554 | /* Copy in the blobs from userspace, check they are vaguely sane. */ | ||
2555 | err = copy_and_check(&info, umod, len, uargs); | ||
2556 | if (err) | ||
2557 | return ERR_PTR(err); | ||
2558 | |||
2559 | /* Figure out module layout, and allocate all the memory. */ | ||
2560 | mod = layout_and_allocate(&info); | ||
2561 | if (IS_ERR(mod)) { | ||
2562 | err = PTR_ERR(mod); | ||
2563 | goto free_copy; | ||
2564 | } | ||
2565 | |||
2566 | /* Now module is in final location, initialize linked lists, etc. */ | ||
2567 | err = module_unload_init(mod); | ||
2568 | if (err) | ||
2569 | goto free_module; | ||
2570 | |||
2571 | /* Now we've got everything in the final locations, we can | ||
2572 | * find optional sections. */ | ||
2573 | find_module_sections(mod, &info); | ||
2574 | |||
2575 | err = check_module_license_and_versions(mod); | ||
2576 | if (err) | ||
2577 | goto free_unload; | ||
2578 | |||
2579 | /* Set up MODINFO_ATTR fields */ | ||
2580 | setup_modinfo(mod, &info); | ||
2581 | |||
2582 | /* Fix up syms, so that st_value is a pointer to location. */ | ||
2583 | err = simplify_symbols(mod, &info); | ||
2584 | if (err < 0) | ||
2585 | goto free_modinfo; | ||
2586 | |||
2587 | err = apply_relocations(mod, &info); | ||
2588 | if (err < 0) | ||
2589 | goto free_modinfo; | ||
2590 | |||
2591 | err = post_relocation(mod, &info); | ||
2592 | if (err < 0) | ||
2593 | goto free_modinfo; | ||
2594 | |||
2595 | flush_module_icache(mod); | ||
2596 | |||
2597 | /* Now copy in args */ | ||
2598 | mod->args = strndup_user(uargs, ~0UL >> 1); | ||
2599 | if (IS_ERR(mod->args)) { | ||
2600 | err = PTR_ERR(mod->args); | ||
2601 | goto free_arch_cleanup; | ||
2602 | } | ||
2603 | |||
2604 | /* Mark state as coming so strong_try_module_get() ignores us. */ | ||
2605 | mod->state = MODULE_STATE_COMING; | ||
2519 | 2606 | ||
2520 | /* Now sew it into the lists so we can get lockdep and oops | 2607 | /* Now sew it into the lists so we can get lockdep and oops |
2521 | * info during argument parsing. No one should access us, since | 2608 | * info during argument parsing. No one should access us, since |
@@ -2530,70 +2617,61 @@ static noinline struct module *load_module(void __user *umod, | |||
2530 | goto unlock; | 2617 | goto unlock; |
2531 | } | 2618 | } |
2532 | 2619 | ||
2533 | if (debug) | 2620 | /* This has to be done once we're sure module name is unique. */ |
2534 | dynamic_debug_setup(debug, num_debug); | 2621 | if (!mod->taints) |
2622 | dynamic_debug_setup(info.debug, info.num_debug); | ||
2535 | 2623 | ||
2536 | /* Find duplicate symbols */ | 2624 | /* Find duplicate symbols */ |
2537 | err = verify_export_symbols(mod); | 2625 | err = verify_export_symbols(mod); |
2538 | if (err < 0) | 2626 | if (err < 0) |
2539 | goto ddebug; | 2627 | goto ddebug; |
2540 | 2628 | ||
2629 | module_bug_finalize(info.hdr, info.sechdrs, mod); | ||
2541 | list_add_rcu(&mod->list, &modules); | 2630 | list_add_rcu(&mod->list, &modules); |
2542 | mutex_unlock(&module_mutex); | 2631 | mutex_unlock(&module_mutex); |
2543 | 2632 | ||
2633 | /* Module is ready to execute: parsing args may do that. */ | ||
2544 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); | 2634 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); |
2545 | if (err < 0) | 2635 | if (err < 0) |
2546 | goto unlink; | 2636 | goto unlink; |
2547 | 2637 | ||
2548 | err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); | 2638 | /* Link in to sysfs. */ |
2639 | err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); | ||
2549 | if (err < 0) | 2640 | if (err < 0) |
2550 | goto unlink; | 2641 | goto unlink; |
2551 | 2642 | ||
2552 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2643 | /* Get rid of temporary copy and strmap. */ |
2553 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2644 | kfree(info.strmap); |
2554 | 2645 | free_copy(&info); | |
2555 | /* Get rid of temporary copy */ | ||
2556 | vfree(hdr); | ||
2557 | |||
2558 | trace_module_load(mod); | ||
2559 | 2646 | ||
2560 | /* Done! */ | 2647 | /* Done! */ |
2648 | trace_module_load(mod); | ||
2561 | return mod; | 2649 | return mod; |
2562 | 2650 | ||
2563 | unlink: | 2651 | unlink: |
2564 | mutex_lock(&module_mutex); | 2652 | mutex_lock(&module_mutex); |
2565 | /* Unlink carefully: kallsyms could be walking list. */ | 2653 | /* Unlink carefully: kallsyms could be walking list. */ |
2566 | list_del_rcu(&mod->list); | 2654 | list_del_rcu(&mod->list); |
2655 | module_bug_cleanup(mod); | ||
2656 | |||
2567 | ddebug: | 2657 | ddebug: |
2568 | dynamic_debug_remove(debug); | 2658 | if (!mod->taints) |
2659 | dynamic_debug_remove(info.debug); | ||
2569 | unlock: | 2660 | unlock: |
2570 | mutex_unlock(&module_mutex); | 2661 | mutex_unlock(&module_mutex); |
2571 | synchronize_sched(); | 2662 | synchronize_sched(); |
2663 | kfree(mod->args); | ||
2664 | free_arch_cleanup: | ||
2572 | module_arch_cleanup(mod); | 2665 | module_arch_cleanup(mod); |
2573 | cleanup: | 2666 | free_modinfo: |
2574 | free_modinfo(mod); | 2667 | free_modinfo(mod); |
2668 | free_unload: | ||
2575 | module_unload_free(mod); | 2669 | module_unload_free(mod); |
2576 | #if defined(CONFIG_MODULE_UNLOAD) | 2670 | free_module: |
2577 | free_percpu(mod->refptr); | 2671 | module_deallocate(mod, &info); |
2578 | free_init: | 2672 | free_copy: |
2579 | #endif | 2673 | free_copy(&info); |
2580 | module_free(mod, mod->module_init); | ||
2581 | free_core: | ||
2582 | module_free(mod, mod->module_core); | ||
2583 | /* mod will be freed with core. Don't access it beyond this line! */ | ||
2584 | free_percpu: | ||
2585 | free_percpu(percpu); | ||
2586 | free_mod: | ||
2587 | kfree(args); | ||
2588 | kfree(strmap); | ||
2589 | free_hdr: | ||
2590 | vfree(hdr); | ||
2591 | return ERR_PTR(err); | 2674 | return ERR_PTR(err); |
2592 | |||
2593 | truncated: | ||
2594 | printk(KERN_ERR "Module len %lu truncated\n", len); | ||
2595 | err = -ENOEXEC; | ||
2596 | goto free_hdr; | ||
2597 | } | 2675 | } |
2598 | 2676 | ||
2599 | /* Call module constructors. */ | 2677 | /* Call module constructors. */ |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3e6d2e..200407c1502f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -36,15 +36,6 @@ | |||
36 | # include <asm/mutex.h> | 36 | # include <asm/mutex.h> |
37 | #endif | 37 | #endif |
38 | 38 | ||
39 | /*** | ||
40 | * mutex_init - initialize the mutex | ||
41 | * @lock: the mutex to be initialized | ||
42 | * @key: the lock_class_key for the class; used by mutex lock debugging | ||
43 | * | ||
44 | * Initialize the mutex to unlocked state. | ||
45 | * | ||
46 | * It is not allowed to initialize an already locked mutex. | ||
47 | */ | ||
48 | void | 39 | void |
49 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | 40 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) |
50 | { | 41 | { |
@@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); | |||
68 | static __used noinline void __sched | 59 | static __used noinline void __sched |
69 | __mutex_lock_slowpath(atomic_t *lock_count); | 60 | __mutex_lock_slowpath(atomic_t *lock_count); |
70 | 61 | ||
71 | /*** | 62 | /** |
72 | * mutex_lock - acquire the mutex | 63 | * mutex_lock - acquire the mutex |
73 | * @lock: the mutex to be acquired | 64 | * @lock: the mutex to be acquired |
74 | * | 65 | * |
@@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); | |||
105 | 96 | ||
106 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); | 97 | static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); |
107 | 98 | ||
108 | /*** | 99 | /** |
109 | * mutex_unlock - release the mutex | 100 | * mutex_unlock - release the mutex |
110 | * @lock: the mutex to be released | 101 | * @lock: the mutex to be released |
111 | * | 102 | * |
@@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); | |||
364 | static noinline int __sched | 355 | static noinline int __sched |
365 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); | 356 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); |
366 | 357 | ||
367 | /*** | 358 | /** |
368 | * mutex_lock_interruptible - acquire the mutex, interruptable | 359 | * mutex_lock_interruptible - acquire the mutex, interruptible |
369 | * @lock: the mutex to be acquired | 360 | * @lock: the mutex to be acquired |
370 | * | 361 | * |
371 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has | 362 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has |
@@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
456 | return prev == 1; | 447 | return prev == 1; |
457 | } | 448 | } |
458 | 449 | ||
459 | /*** | 450 | /** |
460 | * mutex_trylock - try acquire the mutex, without waiting | 451 | * mutex_trylock - try to acquire the mutex, without waiting |
461 | * @lock: the mutex to be acquired | 452 | * @lock: the mutex to be acquired |
462 | * | 453 | * |
463 | * Try to acquire the mutex atomically. Returns 1 if the mutex | 454 | * Try to acquire the mutex atomically. Returns 1 if the mutex |
464 | * has been acquired successfully, and 0 on contention. | 455 | * has been acquired successfully, and 0 on contention. |
465 | * | 456 | * |
466 | * NOTE: this function follows the spin_trylock() convention, so | 457 | * NOTE: this function follows the spin_trylock() convention, so |
467 | * it is negated to the down_trylock() return values! Be careful | 458 | * it is negated from the down_trylock() return values! Be careful |
468 | * about this when converting semaphore users to mutexes. | 459 | * about this when converting semaphore users to mutexes. |
469 | * | 460 | * |
470 | * This function must not be used in interrupt context. The | 461 | * This function must not be used in interrupt context. The |
diff --git a/kernel/panic.c b/kernel/panic.c index 3b16cd93fa7d..4c13b1a88ebb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -24,6 +24,9 @@ | |||
24 | #include <linux/nmi.h> | 24 | #include <linux/nmi.h> |
25 | #include <linux/dmi.h> | 25 | #include <linux/dmi.h> |
26 | 26 | ||
27 | #define PANIC_TIMER_STEP 100 | ||
28 | #define PANIC_BLINK_SPD 18 | ||
29 | |||
27 | int panic_on_oops; | 30 | int panic_on_oops; |
28 | static unsigned long tainted_mask; | 31 | static unsigned long tainted_mask; |
29 | static int pause_on_oops; | 32 | static int pause_on_oops; |
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | |||
36 | 39 | ||
37 | EXPORT_SYMBOL(panic_notifier_list); | 40 | EXPORT_SYMBOL(panic_notifier_list); |
38 | 41 | ||
39 | /* Returns how long it waited in ms */ | 42 | static long no_blink(int state) |
40 | long (*panic_blink)(long time); | ||
41 | EXPORT_SYMBOL(panic_blink); | ||
42 | |||
43 | static void panic_blink_one_second(void) | ||
44 | { | 43 | { |
45 | static long i = 0, end; | 44 | return 0; |
46 | |||
47 | if (panic_blink) { | ||
48 | end = i + MSEC_PER_SEC; | ||
49 | |||
50 | while (i < end) { | ||
51 | i += panic_blink(i); | ||
52 | mdelay(1); | ||
53 | i++; | ||
54 | } | ||
55 | } else { | ||
56 | /* | ||
57 | * When running under a hypervisor a small mdelay may get | ||
58 | * rounded up to the hypervisor timeslice. For example, with | ||
59 | * a 1ms in 10ms hypervisor timeslice we might inflate a | ||
60 | * mdelay(1) loop by 10x. | ||
61 | * | ||
62 | * If we have nothing to blink, spin on 1 second calls to | ||
63 | * mdelay to avoid this. | ||
64 | */ | ||
65 | mdelay(MSEC_PER_SEC); | ||
66 | } | ||
67 | } | 45 | } |
68 | 46 | ||
47 | /* Returns how long it waited in ms */ | ||
48 | long (*panic_blink)(int state); | ||
49 | EXPORT_SYMBOL(panic_blink); | ||
50 | |||
69 | /** | 51 | /** |
70 | * panic - halt the system | 52 | * panic - halt the system |
71 | * @fmt: The text string to print | 53 | * @fmt: The text string to print |
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
78 | { | 60 | { |
79 | static char buf[1024]; | 61 | static char buf[1024]; |
80 | va_list args; | 62 | va_list args; |
81 | long i; | 63 | long i, i_next = 0; |
64 | int state = 0; | ||
82 | 65 | ||
83 | /* | 66 | /* |
84 | * It's possible to come here directly from a panic-assertion and | 67 | * It's possible to come here directly from a panic-assertion and |
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
117 | 100 | ||
118 | bust_spinlocks(0); | 101 | bust_spinlocks(0); |
119 | 102 | ||
103 | if (!panic_blink) | ||
104 | panic_blink = no_blink; | ||
105 | |||
120 | if (panic_timeout > 0) { | 106 | if (panic_timeout > 0) { |
121 | /* | 107 | /* |
122 | * Delay timeout seconds before rebooting the machine. | 108 | * Delay timeout seconds before rebooting the machine. |
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
124 | */ | 110 | */ |
125 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); | 111 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); |
126 | 112 | ||
127 | for (i = 0; i < panic_timeout; i++) { | 113 | for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { |
128 | touch_nmi_watchdog(); | 114 | touch_nmi_watchdog(); |
129 | panic_blink_one_second(); | 115 | if (i >= i_next) { |
116 | i += panic_blink(state ^= 1); | ||
117 | i_next = i + 3600 / PANIC_BLINK_SPD; | ||
118 | } | ||
119 | mdelay(PANIC_TIMER_STEP); | ||
130 | } | 120 | } |
131 | /* | 121 | /* |
132 | * This will not be a clean reboot, with everything | 122 | * This will not be a clean reboot, with everything |
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
152 | } | 142 | } |
153 | #endif | 143 | #endif |
154 | local_irq_enable(); | 144 | local_irq_enable(); |
155 | while (1) { | 145 | for (i = 0; ; i += PANIC_TIMER_STEP) { |
156 | touch_softlockup_watchdog(); | 146 | touch_softlockup_watchdog(); |
157 | panic_blink_one_second(); | 147 | if (i >= i_next) { |
148 | i += panic_blink(state ^= 1); | ||
149 | i_next = i + 3600 / PANIC_BLINK_SPD; | ||
150 | } | ||
151 | mdelay(PANIC_TIMER_STEP); | ||
158 | } | 152 | } |
159 | } | 153 | } |
160 | 154 | ||
@@ -344,7 +338,7 @@ static int init_oops_id(void) | |||
344 | } | 338 | } |
345 | late_initcall(init_oops_id); | 339 | late_initcall(init_oops_id); |
346 | 340 | ||
347 | static void print_oops_end_marker(void) | 341 | void print_oops_end_marker(void) |
348 | { | 342 | { |
349 | init_oops_id(); | 343 | init_oops_id(); |
350 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | 344 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", |
diff --git a/kernel/params.c b/kernel/params.c index 0b30ecd53a52..08107d181758 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -31,6 +31,42 @@ | |||
31 | #define DEBUGP(fmt, a...) | 31 | #define DEBUGP(fmt, a...) |
32 | #endif | 32 | #endif |
33 | 33 | ||
34 | /* Protects all parameters, and incidentally kmalloced_param list. */ | ||
35 | static DEFINE_MUTEX(param_lock); | ||
36 | |||
37 | /* This just allows us to keep track of which parameters are kmalloced. */ | ||
38 | struct kmalloced_param { | ||
39 | struct list_head list; | ||
40 | char val[]; | ||
41 | }; | ||
42 | static LIST_HEAD(kmalloced_params); | ||
43 | |||
44 | static void *kmalloc_parameter(unsigned int size) | ||
45 | { | ||
46 | struct kmalloced_param *p; | ||
47 | |||
48 | p = kmalloc(sizeof(*p) + size, GFP_KERNEL); | ||
49 | if (!p) | ||
50 | return NULL; | ||
51 | |||
52 | list_add(&p->list, &kmalloced_params); | ||
53 | return p->val; | ||
54 | } | ||
55 | |||
56 | /* Does nothing if parameter wasn't kmalloced above. */ | ||
57 | static void maybe_kfree_parameter(void *param) | ||
58 | { | ||
59 | struct kmalloced_param *p; | ||
60 | |||
61 | list_for_each_entry(p, &kmalloced_params, list) { | ||
62 | if (p->val == param) { | ||
63 | list_del(&p->list); | ||
64 | kfree(p); | ||
65 | break; | ||
66 | } | ||
67 | } | ||
68 | } | ||
69 | |||
34 | static inline char dash2underscore(char c) | 70 | static inline char dash2underscore(char c) |
35 | { | 71 | { |
36 | if (c == '-') | 72 | if (c == '-') |
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname) | |||
49 | 85 | ||
50 | static int parse_one(char *param, | 86 | static int parse_one(char *param, |
51 | char *val, | 87 | char *val, |
52 | struct kernel_param *params, | 88 | const struct kernel_param *params, |
53 | unsigned num_params, | 89 | unsigned num_params, |
54 | int (*handle_unknown)(char *param, char *val)) | 90 | int (*handle_unknown)(char *param, char *val)) |
55 | { | 91 | { |
56 | unsigned int i; | 92 | unsigned int i; |
93 | int err; | ||
57 | 94 | ||
58 | /* Find parameter */ | 95 | /* Find parameter */ |
59 | for (i = 0; i < num_params; i++) { | 96 | for (i = 0; i < num_params; i++) { |
60 | if (parameq(param, params[i].name)) { | 97 | if (parameq(param, params[i].name)) { |
98 | /* No one handled NULL, so do it here. */ | ||
99 | if (!val && params[i].ops->set != param_set_bool) | ||
100 | return -EINVAL; | ||
61 | DEBUGP("They are equal! Calling %p\n", | 101 | DEBUGP("They are equal! Calling %p\n", |
62 | params[i].set); | 102 | params[i].ops->set); |
63 | return params[i].set(val, &params[i]); | 103 | mutex_lock(&param_lock); |
104 | err = params[i].ops->set(val, &params[i]); | ||
105 | mutex_unlock(&param_lock); | ||
106 | return err; | ||
64 | } | 107 | } |
65 | } | 108 | } |
66 | 109 | ||
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val) | |||
128 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 171 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
129 | int parse_args(const char *name, | 172 | int parse_args(const char *name, |
130 | char *args, | 173 | char *args, |
131 | struct kernel_param *params, | 174 | const struct kernel_param *params, |
132 | unsigned num, | 175 | unsigned num, |
133 | int (*unknown)(char *param, char *val)) | 176 | int (*unknown)(char *param, char *val)) |
134 | { | 177 | { |
@@ -176,22 +219,29 @@ int parse_args(const char *name, | |||
176 | 219 | ||
177 | /* Lazy bastard, eh? */ | 220 | /* Lazy bastard, eh? */ |
178 | #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ | 221 | #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ |
179 | int param_set_##name(const char *val, struct kernel_param *kp) \ | 222 | int param_set_##name(const char *val, const struct kernel_param *kp) \ |
180 | { \ | 223 | { \ |
181 | tmptype l; \ | 224 | tmptype l; \ |
182 | int ret; \ | 225 | int ret; \ |
183 | \ | 226 | \ |
184 | if (!val) return -EINVAL; \ | ||
185 | ret = strtolfn(val, 0, &l); \ | 227 | ret = strtolfn(val, 0, &l); \ |
186 | if (ret == -EINVAL || ((type)l != l)) \ | 228 | if (ret == -EINVAL || ((type)l != l)) \ |
187 | return -EINVAL; \ | 229 | return -EINVAL; \ |
188 | *((type *)kp->arg) = l; \ | 230 | *((type *)kp->arg) = l; \ |
189 | return 0; \ | 231 | return 0; \ |
190 | } \ | 232 | } \ |
191 | int param_get_##name(char *buffer, struct kernel_param *kp) \ | 233 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ |
192 | { \ | 234 | { \ |
193 | return sprintf(buffer, format, *((type *)kp->arg)); \ | 235 | return sprintf(buffer, format, *((type *)kp->arg)); \ |
194 | } | 236 | } \ |
237 | struct kernel_param_ops param_ops_##name = { \ | ||
238 | .set = param_set_##name, \ | ||
239 | .get = param_get_##name, \ | ||
240 | }; \ | ||
241 | EXPORT_SYMBOL(param_set_##name); \ | ||
242 | EXPORT_SYMBOL(param_get_##name); \ | ||
243 | EXPORT_SYMBOL(param_ops_##name) | ||
244 | |||
195 | 245 | ||
196 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); | 246 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); |
197 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); | 247 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); |
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); | |||
201 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); | 251 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); |
202 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | 252 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); |
203 | 253 | ||
204 | int param_set_charp(const char *val, struct kernel_param *kp) | 254 | int param_set_charp(const char *val, const struct kernel_param *kp) |
205 | { | 255 | { |
206 | if (!val) { | ||
207 | printk(KERN_ERR "%s: string parameter expected\n", | ||
208 | kp->name); | ||
209 | return -EINVAL; | ||
210 | } | ||
211 | |||
212 | if (strlen(val) > 1024) { | 256 | if (strlen(val) > 1024) { |
213 | printk(KERN_ERR "%s: string parameter too long\n", | 257 | printk(KERN_ERR "%s: string parameter too long\n", |
214 | kp->name); | 258 | kp->name); |
215 | return -ENOSPC; | 259 | return -ENOSPC; |
216 | } | 260 | } |
217 | 261 | ||
218 | /* This is a hack. We can't need to strdup in early boot, and we | 262 | maybe_kfree_parameter(*(char **)kp->arg); |
263 | |||
264 | /* This is a hack. We can't kmalloc in early boot, and we | ||
219 | * don't need to; this mangled commandline is preserved. */ | 265 | * don't need to; this mangled commandline is preserved. */ |
220 | if (slab_is_available()) { | 266 | if (slab_is_available()) { |
221 | *(char **)kp->arg = kstrdup(val, GFP_KERNEL); | 267 | *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); |
222 | if (!*(char **)kp->arg) | 268 | if (!*(char **)kp->arg) |
223 | return -ENOMEM; | 269 | return -ENOMEM; |
270 | strcpy(*(char **)kp->arg, val); | ||
224 | } else | 271 | } else |
225 | *(const char **)kp->arg = val; | 272 | *(const char **)kp->arg = val; |
226 | 273 | ||
227 | return 0; | 274 | return 0; |
228 | } | 275 | } |
276 | EXPORT_SYMBOL(param_set_charp); | ||
229 | 277 | ||
230 | int param_get_charp(char *buffer, struct kernel_param *kp) | 278 | int param_get_charp(char *buffer, const struct kernel_param *kp) |
231 | { | 279 | { |
232 | return sprintf(buffer, "%s", *((char **)kp->arg)); | 280 | return sprintf(buffer, "%s", *((char **)kp->arg)); |
233 | } | 281 | } |
282 | EXPORT_SYMBOL(param_get_charp); | ||
283 | |||
284 | static void param_free_charp(void *arg) | ||
285 | { | ||
286 | maybe_kfree_parameter(*((char **)arg)); | ||
287 | } | ||
288 | |||
289 | struct kernel_param_ops param_ops_charp = { | ||
290 | .set = param_set_charp, | ||
291 | .get = param_get_charp, | ||
292 | .free = param_free_charp, | ||
293 | }; | ||
294 | EXPORT_SYMBOL(param_ops_charp); | ||
234 | 295 | ||
235 | /* Actually could be a bool or an int, for historical reasons. */ | 296 | /* Actually could be a bool or an int, for historical reasons. */ |
236 | int param_set_bool(const char *val, struct kernel_param *kp) | 297 | int param_set_bool(const char *val, const struct kernel_param *kp) |
237 | { | 298 | { |
238 | bool v; | 299 | bool v; |
239 | 300 | ||
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp) | |||
258 | *(int *)kp->arg = v; | 319 | *(int *)kp->arg = v; |
259 | return 0; | 320 | return 0; |
260 | } | 321 | } |
322 | EXPORT_SYMBOL(param_set_bool); | ||
261 | 323 | ||
262 | int param_get_bool(char *buffer, struct kernel_param *kp) | 324 | int param_get_bool(char *buffer, const struct kernel_param *kp) |
263 | { | 325 | { |
264 | bool val; | 326 | bool val; |
265 | if (kp->flags & KPARAM_ISBOOL) | 327 | if (kp->flags & KPARAM_ISBOOL) |
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp) | |||
270 | /* Y and N chosen as being relatively non-coder friendly */ | 332 | /* Y and N chosen as being relatively non-coder friendly */ |
271 | return sprintf(buffer, "%c", val ? 'Y' : 'N'); | 333 | return sprintf(buffer, "%c", val ? 'Y' : 'N'); |
272 | } | 334 | } |
335 | EXPORT_SYMBOL(param_get_bool); | ||
336 | |||
337 | struct kernel_param_ops param_ops_bool = { | ||
338 | .set = param_set_bool, | ||
339 | .get = param_get_bool, | ||
340 | }; | ||
341 | EXPORT_SYMBOL(param_ops_bool); | ||
273 | 342 | ||
274 | /* This one must be bool. */ | 343 | /* This one must be bool. */ |
275 | int param_set_invbool(const char *val, struct kernel_param *kp) | 344 | int param_set_invbool(const char *val, const struct kernel_param *kp) |
276 | { | 345 | { |
277 | int ret; | 346 | int ret; |
278 | bool boolval; | 347 | bool boolval; |
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp) | |||
285 | *(bool *)kp->arg = !boolval; | 354 | *(bool *)kp->arg = !boolval; |
286 | return ret; | 355 | return ret; |
287 | } | 356 | } |
357 | EXPORT_SYMBOL(param_set_invbool); | ||
288 | 358 | ||
289 | int param_get_invbool(char *buffer, struct kernel_param *kp) | 359 | int param_get_invbool(char *buffer, const struct kernel_param *kp) |
290 | { | 360 | { |
291 | return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); | 361 | return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); |
292 | } | 362 | } |
363 | EXPORT_SYMBOL(param_get_invbool); | ||
364 | |||
365 | struct kernel_param_ops param_ops_invbool = { | ||
366 | .set = param_set_invbool, | ||
367 | .get = param_get_invbool, | ||
368 | }; | ||
369 | EXPORT_SYMBOL(param_ops_invbool); | ||
293 | 370 | ||
294 | /* We break the rule and mangle the string. */ | 371 | /* We break the rule and mangle the string. */ |
295 | static int param_array(const char *name, | 372 | static int param_array(const char *name, |
296 | const char *val, | 373 | const char *val, |
297 | unsigned int min, unsigned int max, | 374 | unsigned int min, unsigned int max, |
298 | void *elem, int elemsize, | 375 | void *elem, int elemsize, |
299 | int (*set)(const char *, struct kernel_param *kp), | 376 | int (*set)(const char *, const struct kernel_param *kp), |
300 | u16 flags, | 377 | u16 flags, |
301 | unsigned int *num) | 378 | unsigned int *num) |
302 | { | 379 | { |
@@ -309,12 +386,6 @@ static int param_array(const char *name, | |||
309 | kp.arg = elem; | 386 | kp.arg = elem; |
310 | kp.flags = flags; | 387 | kp.flags = flags; |
311 | 388 | ||
312 | /* No equals sign? */ | ||
313 | if (!val) { | ||
314 | printk(KERN_ERR "%s: expects arguments\n", name); | ||
315 | return -EINVAL; | ||
316 | } | ||
317 | |||
318 | *num = 0; | 389 | *num = 0; |
319 | /* We expect a comma-separated list of values. */ | 390 | /* We expect a comma-separated list of values. */ |
320 | do { | 391 | do { |
@@ -330,6 +401,7 @@ static int param_array(const char *name, | |||
330 | /* nul-terminate and parse */ | 401 | /* nul-terminate and parse */ |
331 | save = val[len]; | 402 | save = val[len]; |
332 | ((char *)val)[len] = '\0'; | 403 | ((char *)val)[len] = '\0'; |
404 | BUG_ON(!mutex_is_locked(&param_lock)); | ||
333 | ret = set(val, &kp); | 405 | ret = set(val, &kp); |
334 | 406 | ||
335 | if (ret != 0) | 407 | if (ret != 0) |
@@ -347,17 +419,17 @@ static int param_array(const char *name, | |||
347 | return 0; | 419 | return 0; |
348 | } | 420 | } |
349 | 421 | ||
350 | int param_array_set(const char *val, struct kernel_param *kp) | 422 | static int param_array_set(const char *val, const struct kernel_param *kp) |
351 | { | 423 | { |
352 | const struct kparam_array *arr = kp->arr; | 424 | const struct kparam_array *arr = kp->arr; |
353 | unsigned int temp_num; | 425 | unsigned int temp_num; |
354 | 426 | ||
355 | return param_array(kp->name, val, 1, arr->max, arr->elem, | 427 | return param_array(kp->name, val, 1, arr->max, arr->elem, |
356 | arr->elemsize, arr->set, kp->flags, | 428 | arr->elemsize, arr->ops->set, kp->flags, |
357 | arr->num ?: &temp_num); | 429 | arr->num ?: &temp_num); |
358 | } | 430 | } |
359 | 431 | ||
360 | int param_array_get(char *buffer, struct kernel_param *kp) | 432 | static int param_array_get(char *buffer, const struct kernel_param *kp) |
361 | { | 433 | { |
362 | int i, off, ret; | 434 | int i, off, ret; |
363 | const struct kparam_array *arr = kp->arr; | 435 | const struct kparam_array *arr = kp->arr; |
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp) | |||
368 | if (i) | 440 | if (i) |
369 | buffer[off++] = ','; | 441 | buffer[off++] = ','; |
370 | p.arg = arr->elem + arr->elemsize * i; | 442 | p.arg = arr->elem + arr->elemsize * i; |
371 | ret = arr->get(buffer + off, &p); | 443 | BUG_ON(!mutex_is_locked(&param_lock)); |
444 | ret = arr->ops->get(buffer + off, &p); | ||
372 | if (ret < 0) | 445 | if (ret < 0) |
373 | return ret; | 446 | return ret; |
374 | off += ret; | 447 | off += ret; |
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp) | |||
377 | return off; | 450 | return off; |
378 | } | 451 | } |
379 | 452 | ||
380 | int param_set_copystring(const char *val, struct kernel_param *kp) | 453 | static void param_array_free(void *arg) |
454 | { | ||
455 | unsigned int i; | ||
456 | const struct kparam_array *arr = arg; | ||
457 | |||
458 | if (arr->ops->free) | ||
459 | for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) | ||
460 | arr->ops->free(arr->elem + arr->elemsize * i); | ||
461 | } | ||
462 | |||
463 | struct kernel_param_ops param_array_ops = { | ||
464 | .set = param_array_set, | ||
465 | .get = param_array_get, | ||
466 | .free = param_array_free, | ||
467 | }; | ||
468 | EXPORT_SYMBOL(param_array_ops); | ||
469 | |||
470 | int param_set_copystring(const char *val, const struct kernel_param *kp) | ||
381 | { | 471 | { |
382 | const struct kparam_string *kps = kp->str; | 472 | const struct kparam_string *kps = kp->str; |
383 | 473 | ||
384 | if (!val) { | ||
385 | printk(KERN_ERR "%s: missing param set value\n", kp->name); | ||
386 | return -EINVAL; | ||
387 | } | ||
388 | if (strlen(val)+1 > kps->maxlen) { | 474 | if (strlen(val)+1 > kps->maxlen) { |
389 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", | 475 | printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", |
390 | kp->name, kps->maxlen-1); | 476 | kp->name, kps->maxlen-1); |
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp) | |||
393 | strcpy(kps->string, val); | 479 | strcpy(kps->string, val); |
394 | return 0; | 480 | return 0; |
395 | } | 481 | } |
482 | EXPORT_SYMBOL(param_set_copystring); | ||
396 | 483 | ||
397 | int param_get_string(char *buffer, struct kernel_param *kp) | 484 | int param_get_string(char *buffer, const struct kernel_param *kp) |
398 | { | 485 | { |
399 | const struct kparam_string *kps = kp->str; | 486 | const struct kparam_string *kps = kp->str; |
400 | return strlcpy(buffer, kps->string, kps->maxlen); | 487 | return strlcpy(buffer, kps->string, kps->maxlen); |
401 | } | 488 | } |
489 | EXPORT_SYMBOL(param_get_string); | ||
490 | |||
491 | struct kernel_param_ops param_ops_string = { | ||
492 | .set = param_set_copystring, | ||
493 | .get = param_get_string, | ||
494 | }; | ||
495 | EXPORT_SYMBOL(param_ops_string); | ||
402 | 496 | ||
403 | /* sysfs output in /sys/modules/XYZ/parameters/ */ | 497 | /* sysfs output in /sys/modules/XYZ/parameters/ */ |
404 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) | 498 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) |
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[]; | |||
409 | struct param_attribute | 503 | struct param_attribute |
410 | { | 504 | { |
411 | struct module_attribute mattr; | 505 | struct module_attribute mattr; |
412 | struct kernel_param *param; | 506 | const struct kernel_param *param; |
413 | }; | 507 | }; |
414 | 508 | ||
415 | struct module_param_attrs | 509 | struct module_param_attrs |
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr, | |||
428 | int count; | 522 | int count; |
429 | struct param_attribute *attribute = to_param_attr(mattr); | 523 | struct param_attribute *attribute = to_param_attr(mattr); |
430 | 524 | ||
431 | if (!attribute->param->get) | 525 | if (!attribute->param->ops->get) |
432 | return -EPERM; | 526 | return -EPERM; |
433 | 527 | ||
434 | count = attribute->param->get(buf, attribute->param); | 528 | mutex_lock(&param_lock); |
529 | count = attribute->param->ops->get(buf, attribute->param); | ||
530 | mutex_unlock(&param_lock); | ||
435 | if (count > 0) { | 531 | if (count > 0) { |
436 | strcat(buf, "\n"); | 532 | strcat(buf, "\n"); |
437 | ++count; | 533 | ++count; |
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
447 | int err; | 543 | int err; |
448 | struct param_attribute *attribute = to_param_attr(mattr); | 544 | struct param_attribute *attribute = to_param_attr(mattr); |
449 | 545 | ||
450 | if (!attribute->param->set) | 546 | if (!attribute->param->ops->set) |
451 | return -EPERM; | 547 | return -EPERM; |
452 | 548 | ||
453 | err = attribute->param->set(buf, attribute->param); | 549 | mutex_lock(&param_lock); |
550 | err = attribute->param->ops->set(buf, attribute->param); | ||
551 | mutex_unlock(&param_lock); | ||
454 | if (!err) | 552 | if (!err) |
455 | return len; | 553 | return len; |
456 | return err; | 554 | return err; |
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
464 | #endif | 562 | #endif |
465 | 563 | ||
466 | #ifdef CONFIG_SYSFS | 564 | #ifdef CONFIG_SYSFS |
565 | void __kernel_param_lock(void) | ||
566 | { | ||
567 | mutex_lock(&param_lock); | ||
568 | } | ||
569 | EXPORT_SYMBOL(__kernel_param_lock); | ||
570 | |||
571 | void __kernel_param_unlock(void) | ||
572 | { | ||
573 | mutex_unlock(¶m_lock); | ||
574 | } | ||
575 | EXPORT_SYMBOL(__kernel_param_unlock); | ||
576 | |||
467 | /* | 577 | /* |
468 | * add_sysfs_param - add a parameter to sysfs | 578 | * add_sysfs_param - add a parameter to sysfs |
469 | * @mk: struct module_kobject | 579 | * @mk: struct module_kobject |
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, | |||
475 | * if there's an error. | 585 | * if there's an error. |
476 | */ | 586 | */ |
477 | static __modinit int add_sysfs_param(struct module_kobject *mk, | 587 | static __modinit int add_sysfs_param(struct module_kobject *mk, |
478 | struct kernel_param *kp, | 588 | const struct kernel_param *kp, |
479 | const char *name) | 589 | const char *name) |
480 | { | 590 | { |
481 | struct module_param_attrs *new; | 591 | struct module_param_attrs *new; |
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk) | |||
557 | * /sys/module/[mod->name]/parameters/ | 667 | * /sys/module/[mod->name]/parameters/ |
558 | */ | 668 | */ |
559 | int module_param_sysfs_setup(struct module *mod, | 669 | int module_param_sysfs_setup(struct module *mod, |
560 | struct kernel_param *kparam, | 670 | const struct kernel_param *kparam, |
561 | unsigned int num_params) | 671 | unsigned int num_params) |
562 | { | 672 | { |
563 | int i, err; | 673 | int i, err; |
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod) | |||
602 | 712 | ||
603 | void destroy_params(const struct kernel_param *params, unsigned num) | 713 | void destroy_params(const struct kernel_param *params, unsigned num) |
604 | { | 714 | { |
605 | /* FIXME: This should free kmalloced charp parameters. It doesn't. */ | 715 | unsigned int i; |
716 | |||
717 | for (i = 0; i < num; i++) | ||
718 | if (params[i].ops->free) | ||
719 | params[i].ops->free(params[i].arg); | ||
606 | } | 720 | } |
607 | 721 | ||
608 | static void __init kernel_add_sysfs_param(const char *name, | 722 | static void __init kernel_add_sysfs_param(const char *name, |
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void) | |||
768 | subsys_initcall(param_sysfs_init); | 882 | subsys_initcall(param_sysfs_init); |
769 | 883 | ||
770 | #endif /* CONFIG_SYSFS */ | 884 | #endif /* CONFIG_SYSFS */ |
771 | |||
772 | EXPORT_SYMBOL(param_set_byte); | ||
773 | EXPORT_SYMBOL(param_get_byte); | ||
774 | EXPORT_SYMBOL(param_set_short); | ||
775 | EXPORT_SYMBOL(param_get_short); | ||
776 | EXPORT_SYMBOL(param_set_ushort); | ||
777 | EXPORT_SYMBOL(param_get_ushort); | ||
778 | EXPORT_SYMBOL(param_set_int); | ||
779 | EXPORT_SYMBOL(param_get_int); | ||
780 | EXPORT_SYMBOL(param_set_uint); | ||
781 | EXPORT_SYMBOL(param_get_uint); | ||
782 | EXPORT_SYMBOL(param_set_long); | ||
783 | EXPORT_SYMBOL(param_get_long); | ||
784 | EXPORT_SYMBOL(param_set_ulong); | ||
785 | EXPORT_SYMBOL(param_get_ulong); | ||
786 | EXPORT_SYMBOL(param_set_charp); | ||
787 | EXPORT_SYMBOL(param_get_charp); | ||
788 | EXPORT_SYMBOL(param_set_bool); | ||
789 | EXPORT_SYMBOL(param_get_bool); | ||
790 | EXPORT_SYMBOL(param_set_invbool); | ||
791 | EXPORT_SYMBOL(param_get_invbool); | ||
792 | EXPORT_SYMBOL(param_array_set); | ||
793 | EXPORT_SYMBOL(param_array_get); | ||
794 | EXPORT_SYMBOL(param_set_copystring); | ||
795 | EXPORT_SYMBOL(param_get_string); | ||
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4c..db5b56064687 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
214 | 214 | ||
215 | static inline u64 perf_clock(void) | 215 | static inline u64 perf_clock(void) |
216 | { | 216 | { |
217 | return cpu_clock(raw_smp_processor_id()); | 217 | return local_clock(); |
218 | } | 218 | } |
219 | 219 | ||
220 | /* | 220 | /* |
@@ -402,11 +402,31 @@ static void perf_group_detach(struct perf_event *event) | |||
402 | } | 402 | } |
403 | } | 403 | } |
404 | 404 | ||
405 | static inline int | ||
406 | event_filter_match(struct perf_event *event) | ||
407 | { | ||
408 | return event->cpu == -1 || event->cpu == smp_processor_id(); | ||
409 | } | ||
410 | |||
405 | static void | 411 | static void |
406 | event_sched_out(struct perf_event *event, | 412 | event_sched_out(struct perf_event *event, |
407 | struct perf_cpu_context *cpuctx, | 413 | struct perf_cpu_context *cpuctx, |
408 | struct perf_event_context *ctx) | 414 | struct perf_event_context *ctx) |
409 | { | 415 | { |
416 | u64 delta; | ||
417 | /* | ||
418 | * An event which could not be activated because of | ||
419 | * filter mismatch still needs to have its timings | ||
420 | * maintained, otherwise bogus information is returned | ||
421 | * via read() for time_enabled, time_running: | ||
422 | */ | ||
423 | if (event->state == PERF_EVENT_STATE_INACTIVE | ||
424 | && !event_filter_match(event)) { | ||
425 | delta = ctx->time - event->tstamp_stopped; | ||
426 | event->tstamp_running += delta; | ||
427 | event->tstamp_stopped = ctx->time; | ||
428 | } | ||
429 | |||
410 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 430 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
411 | return; | 431 | return; |
412 | 432 | ||
@@ -432,9 +452,7 @@ group_sched_out(struct perf_event *group_event, | |||
432 | struct perf_event_context *ctx) | 452 | struct perf_event_context *ctx) |
433 | { | 453 | { |
434 | struct perf_event *event; | 454 | struct perf_event *event; |
435 | 455 | int state = group_event->state; | |
436 | if (group_event->state != PERF_EVENT_STATE_ACTIVE) | ||
437 | return; | ||
438 | 456 | ||
439 | event_sched_out(group_event, cpuctx, ctx); | 457 | event_sched_out(group_event, cpuctx, ctx); |
440 | 458 | ||
@@ -444,7 +462,7 @@ group_sched_out(struct perf_event *group_event, | |||
444 | list_for_each_entry(event, &group_event->sibling_list, group_entry) | 462 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
445 | event_sched_out(event, cpuctx, ctx); | 463 | event_sched_out(event, cpuctx, ctx); |
446 | 464 | ||
447 | if (group_event->attr.exclusive) | 465 | if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) |
448 | cpuctx->exclusive = 0; | 466 | cpuctx->exclusive = 0; |
449 | } | 467 | } |
450 | 468 | ||
@@ -675,7 +693,6 @@ group_sched_in(struct perf_event *group_event, | |||
675 | struct perf_event *event, *partial_group = NULL; | 693 | struct perf_event *event, *partial_group = NULL; |
676 | const struct pmu *pmu = group_event->pmu; | 694 | const struct pmu *pmu = group_event->pmu; |
677 | bool txn = false; | 695 | bool txn = false; |
678 | int ret; | ||
679 | 696 | ||
680 | if (group_event->state == PERF_EVENT_STATE_OFF) | 697 | if (group_event->state == PERF_EVENT_STATE_OFF) |
681 | return 0; | 698 | return 0; |
@@ -703,14 +720,8 @@ group_sched_in(struct perf_event *group_event, | |||
703 | } | 720 | } |
704 | } | 721 | } |
705 | 722 | ||
706 | if (!txn) | 723 | if (!txn || !pmu->commit_txn(pmu)) |
707 | return 0; | ||
708 | |||
709 | ret = pmu->commit_txn(pmu); | ||
710 | if (!ret) { | ||
711 | pmu->cancel_txn(pmu); | ||
712 | return 0; | 724 | return 0; |
713 | } | ||
714 | 725 | ||
715 | group_error: | 726 | group_error: |
716 | /* | 727 | /* |
@@ -1155,9 +1166,9 @@ static void __perf_event_sync_stat(struct perf_event *event, | |||
1155 | * In order to keep per-task stats reliable we need to flip the event | 1166 | * In order to keep per-task stats reliable we need to flip the event |
1156 | * values when we flip the contexts. | 1167 | * values when we flip the contexts. |
1157 | */ | 1168 | */ |
1158 | value = atomic64_read(&next_event->count); | 1169 | value = local64_read(&next_event->count); |
1159 | value = atomic64_xchg(&event->count, value); | 1170 | value = local64_xchg(&event->count, value); |
1160 | atomic64_set(&next_event->count, value); | 1171 | local64_set(&next_event->count, value); |
1161 | 1172 | ||
1162 | swap(event->total_time_enabled, next_event->total_time_enabled); | 1173 | swap(event->total_time_enabled, next_event->total_time_enabled); |
1163 | swap(event->total_time_running, next_event->total_time_running); | 1174 | swap(event->total_time_running, next_event->total_time_running); |
@@ -1547,10 +1558,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
1547 | 1558 | ||
1548 | hwc->sample_period = sample_period; | 1559 | hwc->sample_period = sample_period; |
1549 | 1560 | ||
1550 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { | 1561 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
1551 | perf_disable(); | 1562 | perf_disable(); |
1552 | perf_event_stop(event); | 1563 | perf_event_stop(event); |
1553 | atomic64_set(&hwc->period_left, 0); | 1564 | local64_set(&hwc->period_left, 0); |
1554 | perf_event_start(event); | 1565 | perf_event_start(event); |
1555 | perf_enable(); | 1566 | perf_enable(); |
1556 | } | 1567 | } |
@@ -1591,7 +1602,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
1591 | 1602 | ||
1592 | perf_disable(); | 1603 | perf_disable(); |
1593 | event->pmu->read(event); | 1604 | event->pmu->read(event); |
1594 | now = atomic64_read(&event->count); | 1605 | now = local64_read(&event->count); |
1595 | delta = now - hwc->freq_count_stamp; | 1606 | delta = now - hwc->freq_count_stamp; |
1596 | hwc->freq_count_stamp = now; | 1607 | hwc->freq_count_stamp = now; |
1597 | 1608 | ||
@@ -1743,6 +1754,11 @@ static void __perf_event_read(void *info) | |||
1743 | event->pmu->read(event); | 1754 | event->pmu->read(event); |
1744 | } | 1755 | } |
1745 | 1756 | ||
1757 | static inline u64 perf_event_count(struct perf_event *event) | ||
1758 | { | ||
1759 | return local64_read(&event->count) + atomic64_read(&event->child_count); | ||
1760 | } | ||
1761 | |||
1746 | static u64 perf_event_read(struct perf_event *event) | 1762 | static u64 perf_event_read(struct perf_event *event) |
1747 | { | 1763 | { |
1748 | /* | 1764 | /* |
@@ -1762,7 +1778,7 @@ static u64 perf_event_read(struct perf_event *event) | |||
1762 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1778 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1763 | } | 1779 | } |
1764 | 1780 | ||
1765 | return atomic64_read(&event->count); | 1781 | return perf_event_count(event); |
1766 | } | 1782 | } |
1767 | 1783 | ||
1768 | /* | 1784 | /* |
@@ -1883,7 +1899,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
1883 | } | 1899 | } |
1884 | 1900 | ||
1885 | static void perf_pending_sync(struct perf_event *event); | 1901 | static void perf_pending_sync(struct perf_event *event); |
1886 | static void perf_mmap_data_put(struct perf_mmap_data *data); | 1902 | static void perf_buffer_put(struct perf_buffer *buffer); |
1887 | 1903 | ||
1888 | static void free_event(struct perf_event *event) | 1904 | static void free_event(struct perf_event *event) |
1889 | { | 1905 | { |
@@ -1891,7 +1907,7 @@ static void free_event(struct perf_event *event) | |||
1891 | 1907 | ||
1892 | if (!event->parent) { | 1908 | if (!event->parent) { |
1893 | atomic_dec(&nr_events); | 1909 | atomic_dec(&nr_events); |
1894 | if (event->attr.mmap) | 1910 | if (event->attr.mmap || event->attr.mmap_data) |
1895 | atomic_dec(&nr_mmap_events); | 1911 | atomic_dec(&nr_mmap_events); |
1896 | if (event->attr.comm) | 1912 | if (event->attr.comm) |
1897 | atomic_dec(&nr_comm_events); | 1913 | atomic_dec(&nr_comm_events); |
@@ -1899,9 +1915,9 @@ static void free_event(struct perf_event *event) | |||
1899 | atomic_dec(&nr_task_events); | 1915 | atomic_dec(&nr_task_events); |
1900 | } | 1916 | } |
1901 | 1917 | ||
1902 | if (event->data) { | 1918 | if (event->buffer) { |
1903 | perf_mmap_data_put(event->data); | 1919 | perf_buffer_put(event->buffer); |
1904 | event->data = NULL; | 1920 | event->buffer = NULL; |
1905 | } | 1921 | } |
1906 | 1922 | ||
1907 | if (event->destroy) | 1923 | if (event->destroy) |
@@ -2126,13 +2142,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
2126 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 2142 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
2127 | { | 2143 | { |
2128 | struct perf_event *event = file->private_data; | 2144 | struct perf_event *event = file->private_data; |
2129 | struct perf_mmap_data *data; | 2145 | struct perf_buffer *buffer; |
2130 | unsigned int events = POLL_HUP; | 2146 | unsigned int events = POLL_HUP; |
2131 | 2147 | ||
2132 | rcu_read_lock(); | 2148 | rcu_read_lock(); |
2133 | data = rcu_dereference(event->data); | 2149 | buffer = rcu_dereference(event->buffer); |
2134 | if (data) | 2150 | if (buffer) |
2135 | events = atomic_xchg(&data->poll, 0); | 2151 | events = atomic_xchg(&buffer->poll, 0); |
2136 | rcu_read_unlock(); | 2152 | rcu_read_unlock(); |
2137 | 2153 | ||
2138 | poll_wait(file, &event->waitq, wait); | 2154 | poll_wait(file, &event->waitq, wait); |
@@ -2143,7 +2159,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
2143 | static void perf_event_reset(struct perf_event *event) | 2159 | static void perf_event_reset(struct perf_event *event) |
2144 | { | 2160 | { |
2145 | (void)perf_event_read(event); | 2161 | (void)perf_event_read(event); |
2146 | atomic64_set(&event->count, 0); | 2162 | local64_set(&event->count, 0); |
2147 | perf_event_update_userpage(event); | 2163 | perf_event_update_userpage(event); |
2148 | } | 2164 | } |
2149 | 2165 | ||
@@ -2342,14 +2358,14 @@ static int perf_event_index(struct perf_event *event) | |||
2342 | void perf_event_update_userpage(struct perf_event *event) | 2358 | void perf_event_update_userpage(struct perf_event *event) |
2343 | { | 2359 | { |
2344 | struct perf_event_mmap_page *userpg; | 2360 | struct perf_event_mmap_page *userpg; |
2345 | struct perf_mmap_data *data; | 2361 | struct perf_buffer *buffer; |
2346 | 2362 | ||
2347 | rcu_read_lock(); | 2363 | rcu_read_lock(); |
2348 | data = rcu_dereference(event->data); | 2364 | buffer = rcu_dereference(event->buffer); |
2349 | if (!data) | 2365 | if (!buffer) |
2350 | goto unlock; | 2366 | goto unlock; |
2351 | 2367 | ||
2352 | userpg = data->user_page; | 2368 | userpg = buffer->user_page; |
2353 | 2369 | ||
2354 | /* | 2370 | /* |
2355 | * Disable preemption so as to not let the corresponding user-space | 2371 | * Disable preemption so as to not let the corresponding user-space |
@@ -2359,9 +2375,9 @@ void perf_event_update_userpage(struct perf_event *event) | |||
2359 | ++userpg->lock; | 2375 | ++userpg->lock; |
2360 | barrier(); | 2376 | barrier(); |
2361 | userpg->index = perf_event_index(event); | 2377 | userpg->index = perf_event_index(event); |
2362 | userpg->offset = atomic64_read(&event->count); | 2378 | userpg->offset = perf_event_count(event); |
2363 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2379 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
2364 | userpg->offset -= atomic64_read(&event->hw.prev_count); | 2380 | userpg->offset -= local64_read(&event->hw.prev_count); |
2365 | 2381 | ||
2366 | userpg->time_enabled = event->total_time_enabled + | 2382 | userpg->time_enabled = event->total_time_enabled + |
2367 | atomic64_read(&event->child_total_time_enabled); | 2383 | atomic64_read(&event->child_total_time_enabled); |
@@ -2376,6 +2392,25 @@ unlock: | |||
2376 | rcu_read_unlock(); | 2392 | rcu_read_unlock(); |
2377 | } | 2393 | } |
2378 | 2394 | ||
2395 | static unsigned long perf_data_size(struct perf_buffer *buffer); | ||
2396 | |||
2397 | static void | ||
2398 | perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) | ||
2399 | { | ||
2400 | long max_size = perf_data_size(buffer); | ||
2401 | |||
2402 | if (watermark) | ||
2403 | buffer->watermark = min(max_size, watermark); | ||
2404 | |||
2405 | if (!buffer->watermark) | ||
2406 | buffer->watermark = max_size / 2; | ||
2407 | |||
2408 | if (flags & PERF_BUFFER_WRITABLE) | ||
2409 | buffer->writable = 1; | ||
2410 | |||
2411 | atomic_set(&buffer->refcount, 1); | ||
2412 | } | ||
2413 | |||
2379 | #ifndef CONFIG_PERF_USE_VMALLOC | 2414 | #ifndef CONFIG_PERF_USE_VMALLOC |
2380 | 2415 | ||
2381 | /* | 2416 | /* |
@@ -2383,15 +2418,15 @@ unlock: | |||
2383 | */ | 2418 | */ |
2384 | 2419 | ||
2385 | static struct page * | 2420 | static struct page * |
2386 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2421 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) |
2387 | { | 2422 | { |
2388 | if (pgoff > data->nr_pages) | 2423 | if (pgoff > buffer->nr_pages) |
2389 | return NULL; | 2424 | return NULL; |
2390 | 2425 | ||
2391 | if (pgoff == 0) | 2426 | if (pgoff == 0) |
2392 | return virt_to_page(data->user_page); | 2427 | return virt_to_page(buffer->user_page); |
2393 | 2428 | ||
2394 | return virt_to_page(data->data_pages[pgoff - 1]); | 2429 | return virt_to_page(buffer->data_pages[pgoff - 1]); |
2395 | } | 2430 | } |
2396 | 2431 | ||
2397 | static void *perf_mmap_alloc_page(int cpu) | 2432 | static void *perf_mmap_alloc_page(int cpu) |
@@ -2407,42 +2442,44 @@ static void *perf_mmap_alloc_page(int cpu) | |||
2407 | return page_address(page); | 2442 | return page_address(page); |
2408 | } | 2443 | } |
2409 | 2444 | ||
2410 | static struct perf_mmap_data * | 2445 | static struct perf_buffer * |
2411 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2446 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) |
2412 | { | 2447 | { |
2413 | struct perf_mmap_data *data; | 2448 | struct perf_buffer *buffer; |
2414 | unsigned long size; | 2449 | unsigned long size; |
2415 | int i; | 2450 | int i; |
2416 | 2451 | ||
2417 | size = sizeof(struct perf_mmap_data); | 2452 | size = sizeof(struct perf_buffer); |
2418 | size += nr_pages * sizeof(void *); | 2453 | size += nr_pages * sizeof(void *); |
2419 | 2454 | ||
2420 | data = kzalloc(size, GFP_KERNEL); | 2455 | buffer = kzalloc(size, GFP_KERNEL); |
2421 | if (!data) | 2456 | if (!buffer) |
2422 | goto fail; | 2457 | goto fail; |
2423 | 2458 | ||
2424 | data->user_page = perf_mmap_alloc_page(event->cpu); | 2459 | buffer->user_page = perf_mmap_alloc_page(cpu); |
2425 | if (!data->user_page) | 2460 | if (!buffer->user_page) |
2426 | goto fail_user_page; | 2461 | goto fail_user_page; |
2427 | 2462 | ||
2428 | for (i = 0; i < nr_pages; i++) { | 2463 | for (i = 0; i < nr_pages; i++) { |
2429 | data->data_pages[i] = perf_mmap_alloc_page(event->cpu); | 2464 | buffer->data_pages[i] = perf_mmap_alloc_page(cpu); |
2430 | if (!data->data_pages[i]) | 2465 | if (!buffer->data_pages[i]) |
2431 | goto fail_data_pages; | 2466 | goto fail_data_pages; |
2432 | } | 2467 | } |
2433 | 2468 | ||
2434 | data->nr_pages = nr_pages; | 2469 | buffer->nr_pages = nr_pages; |
2435 | 2470 | ||
2436 | return data; | 2471 | perf_buffer_init(buffer, watermark, flags); |
2472 | |||
2473 | return buffer; | ||
2437 | 2474 | ||
2438 | fail_data_pages: | 2475 | fail_data_pages: |
2439 | for (i--; i >= 0; i--) | 2476 | for (i--; i >= 0; i--) |
2440 | free_page((unsigned long)data->data_pages[i]); | 2477 | free_page((unsigned long)buffer->data_pages[i]); |
2441 | 2478 | ||
2442 | free_page((unsigned long)data->user_page); | 2479 | free_page((unsigned long)buffer->user_page); |
2443 | 2480 | ||
2444 | fail_user_page: | 2481 | fail_user_page: |
2445 | kfree(data); | 2482 | kfree(buffer); |
2446 | 2483 | ||
2447 | fail: | 2484 | fail: |
2448 | return NULL; | 2485 | return NULL; |
@@ -2456,17 +2493,17 @@ static void perf_mmap_free_page(unsigned long addr) | |||
2456 | __free_page(page); | 2493 | __free_page(page); |
2457 | } | 2494 | } |
2458 | 2495 | ||
2459 | static void perf_mmap_data_free(struct perf_mmap_data *data) | 2496 | static void perf_buffer_free(struct perf_buffer *buffer) |
2460 | { | 2497 | { |
2461 | int i; | 2498 | int i; |
2462 | 2499 | ||
2463 | perf_mmap_free_page((unsigned long)data->user_page); | 2500 | perf_mmap_free_page((unsigned long)buffer->user_page); |
2464 | for (i = 0; i < data->nr_pages; i++) | 2501 | for (i = 0; i < buffer->nr_pages; i++) |
2465 | perf_mmap_free_page((unsigned long)data->data_pages[i]); | 2502 | perf_mmap_free_page((unsigned long)buffer->data_pages[i]); |
2466 | kfree(data); | 2503 | kfree(buffer); |
2467 | } | 2504 | } |
2468 | 2505 | ||
2469 | static inline int page_order(struct perf_mmap_data *data) | 2506 | static inline int page_order(struct perf_buffer *buffer) |
2470 | { | 2507 | { |
2471 | return 0; | 2508 | return 0; |
2472 | } | 2509 | } |
@@ -2479,18 +2516,18 @@ static inline int page_order(struct perf_mmap_data *data) | |||
2479 | * Required for architectures that have d-cache aliasing issues. | 2516 | * Required for architectures that have d-cache aliasing issues. |
2480 | */ | 2517 | */ |
2481 | 2518 | ||
2482 | static inline int page_order(struct perf_mmap_data *data) | 2519 | static inline int page_order(struct perf_buffer *buffer) |
2483 | { | 2520 | { |
2484 | return data->page_order; | 2521 | return buffer->page_order; |
2485 | } | 2522 | } |
2486 | 2523 | ||
2487 | static struct page * | 2524 | static struct page * |
2488 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2525 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) |
2489 | { | 2526 | { |
2490 | if (pgoff > (1UL << page_order(data))) | 2527 | if (pgoff > (1UL << page_order(buffer))) |
2491 | return NULL; | 2528 | return NULL; |
2492 | 2529 | ||
2493 | return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); | 2530 | return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); |
2494 | } | 2531 | } |
2495 | 2532 | ||
2496 | static void perf_mmap_unmark_page(void *addr) | 2533 | static void perf_mmap_unmark_page(void *addr) |
@@ -2500,57 +2537,59 @@ static void perf_mmap_unmark_page(void *addr) | |||
2500 | page->mapping = NULL; | 2537 | page->mapping = NULL; |
2501 | } | 2538 | } |
2502 | 2539 | ||
2503 | static void perf_mmap_data_free_work(struct work_struct *work) | 2540 | static void perf_buffer_free_work(struct work_struct *work) |
2504 | { | 2541 | { |
2505 | struct perf_mmap_data *data; | 2542 | struct perf_buffer *buffer; |
2506 | void *base; | 2543 | void *base; |
2507 | int i, nr; | 2544 | int i, nr; |
2508 | 2545 | ||
2509 | data = container_of(work, struct perf_mmap_data, work); | 2546 | buffer = container_of(work, struct perf_buffer, work); |
2510 | nr = 1 << page_order(data); | 2547 | nr = 1 << page_order(buffer); |
2511 | 2548 | ||
2512 | base = data->user_page; | 2549 | base = buffer->user_page; |
2513 | for (i = 0; i < nr + 1; i++) | 2550 | for (i = 0; i < nr + 1; i++) |
2514 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | 2551 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); |
2515 | 2552 | ||
2516 | vfree(base); | 2553 | vfree(base); |
2517 | kfree(data); | 2554 | kfree(buffer); |
2518 | } | 2555 | } |
2519 | 2556 | ||
2520 | static void perf_mmap_data_free(struct perf_mmap_data *data) | 2557 | static void perf_buffer_free(struct perf_buffer *buffer) |
2521 | { | 2558 | { |
2522 | schedule_work(&data->work); | 2559 | schedule_work(&buffer->work); |
2523 | } | 2560 | } |
2524 | 2561 | ||
2525 | static struct perf_mmap_data * | 2562 | static struct perf_buffer * |
2526 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2563 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) |
2527 | { | 2564 | { |
2528 | struct perf_mmap_data *data; | 2565 | struct perf_buffer *buffer; |
2529 | unsigned long size; | 2566 | unsigned long size; |
2530 | void *all_buf; | 2567 | void *all_buf; |
2531 | 2568 | ||
2532 | size = sizeof(struct perf_mmap_data); | 2569 | size = sizeof(struct perf_buffer); |
2533 | size += sizeof(void *); | 2570 | size += sizeof(void *); |
2534 | 2571 | ||
2535 | data = kzalloc(size, GFP_KERNEL); | 2572 | buffer = kzalloc(size, GFP_KERNEL); |
2536 | if (!data) | 2573 | if (!buffer) |
2537 | goto fail; | 2574 | goto fail; |
2538 | 2575 | ||
2539 | INIT_WORK(&data->work, perf_mmap_data_free_work); | 2576 | INIT_WORK(&buffer->work, perf_buffer_free_work); |
2540 | 2577 | ||
2541 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | 2578 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); |
2542 | if (!all_buf) | 2579 | if (!all_buf) |
2543 | goto fail_all_buf; | 2580 | goto fail_all_buf; |
2544 | 2581 | ||
2545 | data->user_page = all_buf; | 2582 | buffer->user_page = all_buf; |
2546 | data->data_pages[0] = all_buf + PAGE_SIZE; | 2583 | buffer->data_pages[0] = all_buf + PAGE_SIZE; |
2547 | data->page_order = ilog2(nr_pages); | 2584 | buffer->page_order = ilog2(nr_pages); |
2548 | data->nr_pages = 1; | 2585 | buffer->nr_pages = 1; |
2586 | |||
2587 | perf_buffer_init(buffer, watermark, flags); | ||
2549 | 2588 | ||
2550 | return data; | 2589 | return buffer; |
2551 | 2590 | ||
2552 | fail_all_buf: | 2591 | fail_all_buf: |
2553 | kfree(data); | 2592 | kfree(buffer); |
2554 | 2593 | ||
2555 | fail: | 2594 | fail: |
2556 | return NULL; | 2595 | return NULL; |
@@ -2558,15 +2597,15 @@ fail: | |||
2558 | 2597 | ||
2559 | #endif | 2598 | #endif |
2560 | 2599 | ||
2561 | static unsigned long perf_data_size(struct perf_mmap_data *data) | 2600 | static unsigned long perf_data_size(struct perf_buffer *buffer) |
2562 | { | 2601 | { |
2563 | return data->nr_pages << (PAGE_SHIFT + page_order(data)); | 2602 | return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); |
2564 | } | 2603 | } |
2565 | 2604 | ||
2566 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 2605 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
2567 | { | 2606 | { |
2568 | struct perf_event *event = vma->vm_file->private_data; | 2607 | struct perf_event *event = vma->vm_file->private_data; |
2569 | struct perf_mmap_data *data; | 2608 | struct perf_buffer *buffer; |
2570 | int ret = VM_FAULT_SIGBUS; | 2609 | int ret = VM_FAULT_SIGBUS; |
2571 | 2610 | ||
2572 | if (vmf->flags & FAULT_FLAG_MKWRITE) { | 2611 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
@@ -2576,14 +2615,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
2576 | } | 2615 | } |
2577 | 2616 | ||
2578 | rcu_read_lock(); | 2617 | rcu_read_lock(); |
2579 | data = rcu_dereference(event->data); | 2618 | buffer = rcu_dereference(event->buffer); |
2580 | if (!data) | 2619 | if (!buffer) |
2581 | goto unlock; | 2620 | goto unlock; |
2582 | 2621 | ||
2583 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) | 2622 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) |
2584 | goto unlock; | 2623 | goto unlock; |
2585 | 2624 | ||
2586 | vmf->page = perf_mmap_to_page(data, vmf->pgoff); | 2625 | vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); |
2587 | if (!vmf->page) | 2626 | if (!vmf->page) |
2588 | goto unlock; | 2627 | goto unlock; |
2589 | 2628 | ||
@@ -2598,52 +2637,35 @@ unlock: | |||
2598 | return ret; | 2637 | return ret; |
2599 | } | 2638 | } |
2600 | 2639 | ||
2601 | static void | 2640 | static void perf_buffer_free_rcu(struct rcu_head *rcu_head) |
2602 | perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) | ||
2603 | { | 2641 | { |
2604 | long max_size = perf_data_size(data); | 2642 | struct perf_buffer *buffer; |
2605 | 2643 | ||
2606 | if (event->attr.watermark) { | 2644 | buffer = container_of(rcu_head, struct perf_buffer, rcu_head); |
2607 | data->watermark = min_t(long, max_size, | 2645 | perf_buffer_free(buffer); |
2608 | event->attr.wakeup_watermark); | ||
2609 | } | ||
2610 | |||
2611 | if (!data->watermark) | ||
2612 | data->watermark = max_size / 2; | ||
2613 | |||
2614 | atomic_set(&data->refcount, 1); | ||
2615 | rcu_assign_pointer(event->data, data); | ||
2616 | } | 2646 | } |
2617 | 2647 | ||
2618 | static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) | 2648 | static struct perf_buffer *perf_buffer_get(struct perf_event *event) |
2619 | { | 2649 | { |
2620 | struct perf_mmap_data *data; | 2650 | struct perf_buffer *buffer; |
2621 | |||
2622 | data = container_of(rcu_head, struct perf_mmap_data, rcu_head); | ||
2623 | perf_mmap_data_free(data); | ||
2624 | } | ||
2625 | |||
2626 | static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) | ||
2627 | { | ||
2628 | struct perf_mmap_data *data; | ||
2629 | 2651 | ||
2630 | rcu_read_lock(); | 2652 | rcu_read_lock(); |
2631 | data = rcu_dereference(event->data); | 2653 | buffer = rcu_dereference(event->buffer); |
2632 | if (data) { | 2654 | if (buffer) { |
2633 | if (!atomic_inc_not_zero(&data->refcount)) | 2655 | if (!atomic_inc_not_zero(&buffer->refcount)) |
2634 | data = NULL; | 2656 | buffer = NULL; |
2635 | } | 2657 | } |
2636 | rcu_read_unlock(); | 2658 | rcu_read_unlock(); |
2637 | 2659 | ||
2638 | return data; | 2660 | return buffer; |
2639 | } | 2661 | } |
2640 | 2662 | ||
2641 | static void perf_mmap_data_put(struct perf_mmap_data *data) | 2663 | static void perf_buffer_put(struct perf_buffer *buffer) |
2642 | { | 2664 | { |
2643 | if (!atomic_dec_and_test(&data->refcount)) | 2665 | if (!atomic_dec_and_test(&buffer->refcount)) |
2644 | return; | 2666 | return; |
2645 | 2667 | ||
2646 | call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); | 2668 | call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); |
2647 | } | 2669 | } |
2648 | 2670 | ||
2649 | static void perf_mmap_open(struct vm_area_struct *vma) | 2671 | static void perf_mmap_open(struct vm_area_struct *vma) |
@@ -2658,16 +2680,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
2658 | struct perf_event *event = vma->vm_file->private_data; | 2680 | struct perf_event *event = vma->vm_file->private_data; |
2659 | 2681 | ||
2660 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 2682 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
2661 | unsigned long size = perf_data_size(event->data); | 2683 | unsigned long size = perf_data_size(event->buffer); |
2662 | struct user_struct *user = event->mmap_user; | 2684 | struct user_struct *user = event->mmap_user; |
2663 | struct perf_mmap_data *data = event->data; | 2685 | struct perf_buffer *buffer = event->buffer; |
2664 | 2686 | ||
2665 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 2687 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
2666 | vma->vm_mm->locked_vm -= event->mmap_locked; | 2688 | vma->vm_mm->locked_vm -= event->mmap_locked; |
2667 | rcu_assign_pointer(event->data, NULL); | 2689 | rcu_assign_pointer(event->buffer, NULL); |
2668 | mutex_unlock(&event->mmap_mutex); | 2690 | mutex_unlock(&event->mmap_mutex); |
2669 | 2691 | ||
2670 | perf_mmap_data_put(data); | 2692 | perf_buffer_put(buffer); |
2671 | free_uid(user); | 2693 | free_uid(user); |
2672 | } | 2694 | } |
2673 | } | 2695 | } |
@@ -2685,11 +2707,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2685 | unsigned long user_locked, user_lock_limit; | 2707 | unsigned long user_locked, user_lock_limit; |
2686 | struct user_struct *user = current_user(); | 2708 | struct user_struct *user = current_user(); |
2687 | unsigned long locked, lock_limit; | 2709 | unsigned long locked, lock_limit; |
2688 | struct perf_mmap_data *data; | 2710 | struct perf_buffer *buffer; |
2689 | unsigned long vma_size; | 2711 | unsigned long vma_size; |
2690 | unsigned long nr_pages; | 2712 | unsigned long nr_pages; |
2691 | long user_extra, extra; | 2713 | long user_extra, extra; |
2692 | int ret = 0; | 2714 | int ret = 0, flags = 0; |
2693 | 2715 | ||
2694 | /* | 2716 | /* |
2695 | * Don't allow mmap() of inherited per-task counters. This would | 2717 | * Don't allow mmap() of inherited per-task counters. This would |
@@ -2706,7 +2728,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2706 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 2728 | nr_pages = (vma_size / PAGE_SIZE) - 1; |
2707 | 2729 | ||
2708 | /* | 2730 | /* |
2709 | * If we have data pages ensure they're a power-of-two number, so we | 2731 | * If we have buffer pages ensure they're a power-of-two number, so we |
2710 | * can do bitmasks instead of modulo. | 2732 | * can do bitmasks instead of modulo. |
2711 | */ | 2733 | */ |
2712 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) | 2734 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) |
@@ -2720,9 +2742,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2720 | 2742 | ||
2721 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2743 | WARN_ON_ONCE(event->ctx->parent_ctx); |
2722 | mutex_lock(&event->mmap_mutex); | 2744 | mutex_lock(&event->mmap_mutex); |
2723 | if (event->data) { | 2745 | if (event->buffer) { |
2724 | if (event->data->nr_pages == nr_pages) | 2746 | if (event->buffer->nr_pages == nr_pages) |
2725 | atomic_inc(&event->data->refcount); | 2747 | atomic_inc(&event->buffer->refcount); |
2726 | else | 2748 | else |
2727 | ret = -EINVAL; | 2749 | ret = -EINVAL; |
2728 | goto unlock; | 2750 | goto unlock; |
@@ -2752,17 +2774,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
2752 | goto unlock; | 2774 | goto unlock; |
2753 | } | 2775 | } |
2754 | 2776 | ||
2755 | WARN_ON(event->data); | 2777 | WARN_ON(event->buffer); |
2778 | |||
2779 | if (vma->vm_flags & VM_WRITE) | ||
2780 | flags |= PERF_BUFFER_WRITABLE; | ||
2756 | 2781 | ||
2757 | data = perf_mmap_data_alloc(event, nr_pages); | 2782 | buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, |
2758 | if (!data) { | 2783 | event->cpu, flags); |
2784 | if (!buffer) { | ||
2759 | ret = -ENOMEM; | 2785 | ret = -ENOMEM; |
2760 | goto unlock; | 2786 | goto unlock; |
2761 | } | 2787 | } |
2762 | 2788 | rcu_assign_pointer(event->buffer, buffer); | |
2763 | perf_mmap_data_init(event, data); | ||
2764 | if (vma->vm_flags & VM_WRITE) | ||
2765 | event->data->writable = 1; | ||
2766 | 2789 | ||
2767 | atomic_long_add(user_extra, &user->locked_vm); | 2790 | atomic_long_add(user_extra, &user->locked_vm); |
2768 | event->mmap_locked = extra; | 2791 | event->mmap_locked = extra; |
@@ -2941,11 +2964,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
2941 | return NULL; | 2964 | return NULL; |
2942 | } | 2965 | } |
2943 | 2966 | ||
2944 | __weak | ||
2945 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | ||
2946 | { | ||
2947 | } | ||
2948 | |||
2949 | 2967 | ||
2950 | /* | 2968 | /* |
2951 | * We assume there is only KVM supporting the callbacks. | 2969 | * We assume there is only KVM supporting the callbacks. |
@@ -2971,15 +2989,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | |||
2971 | /* | 2989 | /* |
2972 | * Output | 2990 | * Output |
2973 | */ | 2991 | */ |
2974 | static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, | 2992 | static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, |
2975 | unsigned long offset, unsigned long head) | 2993 | unsigned long offset, unsigned long head) |
2976 | { | 2994 | { |
2977 | unsigned long mask; | 2995 | unsigned long mask; |
2978 | 2996 | ||
2979 | if (!data->writable) | 2997 | if (!buffer->writable) |
2980 | return true; | 2998 | return true; |
2981 | 2999 | ||
2982 | mask = perf_data_size(data) - 1; | 3000 | mask = perf_data_size(buffer) - 1; |
2983 | 3001 | ||
2984 | offset = (offset - tail) & mask; | 3002 | offset = (offset - tail) & mask; |
2985 | head = (head - tail) & mask; | 3003 | head = (head - tail) & mask; |
@@ -2992,7 +3010,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, | |||
2992 | 3010 | ||
2993 | static void perf_output_wakeup(struct perf_output_handle *handle) | 3011 | static void perf_output_wakeup(struct perf_output_handle *handle) |
2994 | { | 3012 | { |
2995 | atomic_set(&handle->data->poll, POLL_IN); | 3013 | atomic_set(&handle->buffer->poll, POLL_IN); |
2996 | 3014 | ||
2997 | if (handle->nmi) { | 3015 | if (handle->nmi) { |
2998 | handle->event->pending_wakeup = 1; | 3016 | handle->event->pending_wakeup = 1; |
@@ -3012,45 +3030,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
3012 | */ | 3030 | */ |
3013 | static void perf_output_get_handle(struct perf_output_handle *handle) | 3031 | static void perf_output_get_handle(struct perf_output_handle *handle) |
3014 | { | 3032 | { |
3015 | struct perf_mmap_data *data = handle->data; | 3033 | struct perf_buffer *buffer = handle->buffer; |
3016 | 3034 | ||
3017 | preempt_disable(); | 3035 | preempt_disable(); |
3018 | local_inc(&data->nest); | 3036 | local_inc(&buffer->nest); |
3019 | handle->wakeup = local_read(&data->wakeup); | 3037 | handle->wakeup = local_read(&buffer->wakeup); |
3020 | } | 3038 | } |
3021 | 3039 | ||
3022 | static void perf_output_put_handle(struct perf_output_handle *handle) | 3040 | static void perf_output_put_handle(struct perf_output_handle *handle) |
3023 | { | 3041 | { |
3024 | struct perf_mmap_data *data = handle->data; | 3042 | struct perf_buffer *buffer = handle->buffer; |
3025 | unsigned long head; | 3043 | unsigned long head; |
3026 | 3044 | ||
3027 | again: | 3045 | again: |
3028 | head = local_read(&data->head); | 3046 | head = local_read(&buffer->head); |
3029 | 3047 | ||
3030 | /* | 3048 | /* |
3031 | * IRQ/NMI can happen here, which means we can miss a head update. | 3049 | * IRQ/NMI can happen here, which means we can miss a head update. |
3032 | */ | 3050 | */ |
3033 | 3051 | ||
3034 | if (!local_dec_and_test(&data->nest)) | 3052 | if (!local_dec_and_test(&buffer->nest)) |
3035 | goto out; | 3053 | goto out; |
3036 | 3054 | ||
3037 | /* | 3055 | /* |
3038 | * Publish the known good head. Rely on the full barrier implied | 3056 | * Publish the known good head. Rely on the full barrier implied |
3039 | * by atomic_dec_and_test() order the data->head read and this | 3057 | * by atomic_dec_and_test() order the buffer->head read and this |
3040 | * write. | 3058 | * write. |
3041 | */ | 3059 | */ |
3042 | data->user_page->data_head = head; | 3060 | buffer->user_page->data_head = head; |
3043 | 3061 | ||
3044 | /* | 3062 | /* |
3045 | * Now check if we missed an update, rely on the (compiler) | 3063 | * Now check if we missed an update, rely on the (compiler) |
3046 | * barrier in atomic_dec_and_test() to re-read data->head. | 3064 | * barrier in atomic_dec_and_test() to re-read buffer->head. |
3047 | */ | 3065 | */ |
3048 | if (unlikely(head != local_read(&data->head))) { | 3066 | if (unlikely(head != local_read(&buffer->head))) { |
3049 | local_inc(&data->nest); | 3067 | local_inc(&buffer->nest); |
3050 | goto again; | 3068 | goto again; |
3051 | } | 3069 | } |
3052 | 3070 | ||
3053 | if (handle->wakeup != local_read(&data->wakeup)) | 3071 | if (handle->wakeup != local_read(&buffer->wakeup)) |
3054 | perf_output_wakeup(handle); | 3072 | perf_output_wakeup(handle); |
3055 | 3073 | ||
3056 | out: | 3074 | out: |
@@ -3070,12 +3088,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
3070 | buf += size; | 3088 | buf += size; |
3071 | handle->size -= size; | 3089 | handle->size -= size; |
3072 | if (!handle->size) { | 3090 | if (!handle->size) { |
3073 | struct perf_mmap_data *data = handle->data; | 3091 | struct perf_buffer *buffer = handle->buffer; |
3074 | 3092 | ||
3075 | handle->page++; | 3093 | handle->page++; |
3076 | handle->page &= data->nr_pages - 1; | 3094 | handle->page &= buffer->nr_pages - 1; |
3077 | handle->addr = data->data_pages[handle->page]; | 3095 | handle->addr = buffer->data_pages[handle->page]; |
3078 | handle->size = PAGE_SIZE << page_order(data); | 3096 | handle->size = PAGE_SIZE << page_order(buffer); |
3079 | } | 3097 | } |
3080 | } while (len); | 3098 | } while (len); |
3081 | } | 3099 | } |
@@ -3084,7 +3102,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3084 | struct perf_event *event, unsigned int size, | 3102 | struct perf_event *event, unsigned int size, |
3085 | int nmi, int sample) | 3103 | int nmi, int sample) |
3086 | { | 3104 | { |
3087 | struct perf_mmap_data *data; | 3105 | struct perf_buffer *buffer; |
3088 | unsigned long tail, offset, head; | 3106 | unsigned long tail, offset, head; |
3089 | int have_lost; | 3107 | int have_lost; |
3090 | struct { | 3108 | struct { |
@@ -3100,19 +3118,19 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3100 | if (event->parent) | 3118 | if (event->parent) |
3101 | event = event->parent; | 3119 | event = event->parent; |
3102 | 3120 | ||
3103 | data = rcu_dereference(event->data); | 3121 | buffer = rcu_dereference(event->buffer); |
3104 | if (!data) | 3122 | if (!buffer) |
3105 | goto out; | 3123 | goto out; |
3106 | 3124 | ||
3107 | handle->data = data; | 3125 | handle->buffer = buffer; |
3108 | handle->event = event; | 3126 | handle->event = event; |
3109 | handle->nmi = nmi; | 3127 | handle->nmi = nmi; |
3110 | handle->sample = sample; | 3128 | handle->sample = sample; |
3111 | 3129 | ||
3112 | if (!data->nr_pages) | 3130 | if (!buffer->nr_pages) |
3113 | goto out; | 3131 | goto out; |
3114 | 3132 | ||
3115 | have_lost = local_read(&data->lost); | 3133 | have_lost = local_read(&buffer->lost); |
3116 | if (have_lost) | 3134 | if (have_lost) |
3117 | size += sizeof(lost_event); | 3135 | size += sizeof(lost_event); |
3118 | 3136 | ||
@@ -3124,30 +3142,30 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3124 | * tail pointer. So that all reads will be completed before the | 3142 | * tail pointer. So that all reads will be completed before the |
3125 | * write is issued. | 3143 | * write is issued. |
3126 | */ | 3144 | */ |
3127 | tail = ACCESS_ONCE(data->user_page->data_tail); | 3145 | tail = ACCESS_ONCE(buffer->user_page->data_tail); |
3128 | smp_rmb(); | 3146 | smp_rmb(); |
3129 | offset = head = local_read(&data->head); | 3147 | offset = head = local_read(&buffer->head); |
3130 | head += size; | 3148 | head += size; |
3131 | if (unlikely(!perf_output_space(data, tail, offset, head))) | 3149 | if (unlikely(!perf_output_space(buffer, tail, offset, head))) |
3132 | goto fail; | 3150 | goto fail; |
3133 | } while (local_cmpxchg(&data->head, offset, head) != offset); | 3151 | } while (local_cmpxchg(&buffer->head, offset, head) != offset); |
3134 | 3152 | ||
3135 | if (head - local_read(&data->wakeup) > data->watermark) | 3153 | if (head - local_read(&buffer->wakeup) > buffer->watermark) |
3136 | local_add(data->watermark, &data->wakeup); | 3154 | local_add(buffer->watermark, &buffer->wakeup); |
3137 | 3155 | ||
3138 | handle->page = offset >> (PAGE_SHIFT + page_order(data)); | 3156 | handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); |
3139 | handle->page &= data->nr_pages - 1; | 3157 | handle->page &= buffer->nr_pages - 1; |
3140 | handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); | 3158 | handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); |
3141 | handle->addr = data->data_pages[handle->page]; | 3159 | handle->addr = buffer->data_pages[handle->page]; |
3142 | handle->addr += handle->size; | 3160 | handle->addr += handle->size; |
3143 | handle->size = (PAGE_SIZE << page_order(data)) - handle->size; | 3161 | handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; |
3144 | 3162 | ||
3145 | if (have_lost) { | 3163 | if (have_lost) { |
3146 | lost_event.header.type = PERF_RECORD_LOST; | 3164 | lost_event.header.type = PERF_RECORD_LOST; |
3147 | lost_event.header.misc = 0; | 3165 | lost_event.header.misc = 0; |
3148 | lost_event.header.size = sizeof(lost_event); | 3166 | lost_event.header.size = sizeof(lost_event); |
3149 | lost_event.id = event->id; | 3167 | lost_event.id = event->id; |
3150 | lost_event.lost = local_xchg(&data->lost, 0); | 3168 | lost_event.lost = local_xchg(&buffer->lost, 0); |
3151 | 3169 | ||
3152 | perf_output_put(handle, lost_event); | 3170 | perf_output_put(handle, lost_event); |
3153 | } | 3171 | } |
@@ -3155,7 +3173,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
3155 | return 0; | 3173 | return 0; |
3156 | 3174 | ||
3157 | fail: | 3175 | fail: |
3158 | local_inc(&data->lost); | 3176 | local_inc(&buffer->lost); |
3159 | perf_output_put_handle(handle); | 3177 | perf_output_put_handle(handle); |
3160 | out: | 3178 | out: |
3161 | rcu_read_unlock(); | 3179 | rcu_read_unlock(); |
@@ -3166,15 +3184,15 @@ out: | |||
3166 | void perf_output_end(struct perf_output_handle *handle) | 3184 | void perf_output_end(struct perf_output_handle *handle) |
3167 | { | 3185 | { |
3168 | struct perf_event *event = handle->event; | 3186 | struct perf_event *event = handle->event; |
3169 | struct perf_mmap_data *data = handle->data; | 3187 | struct perf_buffer *buffer = handle->buffer; |
3170 | 3188 | ||
3171 | int wakeup_events = event->attr.wakeup_events; | 3189 | int wakeup_events = event->attr.wakeup_events; |
3172 | 3190 | ||
3173 | if (handle->sample && wakeup_events) { | 3191 | if (handle->sample && wakeup_events) { |
3174 | int events = local_inc_return(&data->events); | 3192 | int events = local_inc_return(&buffer->events); |
3175 | if (events >= wakeup_events) { | 3193 | if (events >= wakeup_events) { |
3176 | local_sub(wakeup_events, &data->events); | 3194 | local_sub(wakeup_events, &buffer->events); |
3177 | local_inc(&data->wakeup); | 3195 | local_inc(&buffer->wakeup); |
3178 | } | 3196 | } |
3179 | } | 3197 | } |
3180 | 3198 | ||
@@ -3211,7 +3229,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
3211 | u64 values[4]; | 3229 | u64 values[4]; |
3212 | int n = 0; | 3230 | int n = 0; |
3213 | 3231 | ||
3214 | values[n++] = atomic64_read(&event->count); | 3232 | values[n++] = perf_event_count(event); |
3215 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | 3233 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
3216 | values[n++] = event->total_time_enabled + | 3234 | values[n++] = event->total_time_enabled + |
3217 | atomic64_read(&event->child_total_time_enabled); | 3235 | atomic64_read(&event->child_total_time_enabled); |
@@ -3248,7 +3266,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3248 | if (leader != event) | 3266 | if (leader != event) |
3249 | leader->pmu->read(leader); | 3267 | leader->pmu->read(leader); |
3250 | 3268 | ||
3251 | values[n++] = atomic64_read(&leader->count); | 3269 | values[n++] = perf_event_count(leader); |
3252 | if (read_format & PERF_FORMAT_ID) | 3270 | if (read_format & PERF_FORMAT_ID) |
3253 | values[n++] = primary_event_id(leader); | 3271 | values[n++] = primary_event_id(leader); |
3254 | 3272 | ||
@@ -3260,7 +3278,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
3260 | if (sub != event) | 3278 | if (sub != event) |
3261 | sub->pmu->read(sub); | 3279 | sub->pmu->read(sub); |
3262 | 3280 | ||
3263 | values[n++] = atomic64_read(&sub->count); | 3281 | values[n++] = perf_event_count(sub); |
3264 | if (read_format & PERF_FORMAT_ID) | 3282 | if (read_format & PERF_FORMAT_ID) |
3265 | values[n++] = primary_event_id(sub); | 3283 | values[n++] = primary_event_id(sub); |
3266 | 3284 | ||
@@ -3491,7 +3509,7 @@ perf_event_read_event(struct perf_event *event, | |||
3491 | /* | 3509 | /* |
3492 | * task tracking -- fork/exit | 3510 | * task tracking -- fork/exit |
3493 | * | 3511 | * |
3494 | * enabled by: attr.comm | attr.mmap | attr.task | 3512 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task |
3495 | */ | 3513 | */ |
3496 | 3514 | ||
3497 | struct perf_task_event { | 3515 | struct perf_task_event { |
@@ -3541,7 +3559,8 @@ static int perf_event_task_match(struct perf_event *event) | |||
3541 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3559 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3542 | return 0; | 3560 | return 0; |
3543 | 3561 | ||
3544 | if (event->attr.comm || event->attr.mmap || event->attr.task) | 3562 | if (event->attr.comm || event->attr.mmap || |
3563 | event->attr.mmap_data || event->attr.task) | ||
3545 | return 1; | 3564 | return 1; |
3546 | 3565 | ||
3547 | return 0; | 3566 | return 0; |
@@ -3766,7 +3785,8 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
3766 | } | 3785 | } |
3767 | 3786 | ||
3768 | static int perf_event_mmap_match(struct perf_event *event, | 3787 | static int perf_event_mmap_match(struct perf_event *event, |
3769 | struct perf_mmap_event *mmap_event) | 3788 | struct perf_mmap_event *mmap_event, |
3789 | int executable) | ||
3770 | { | 3790 | { |
3771 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 3791 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3772 | return 0; | 3792 | return 0; |
@@ -3774,19 +3794,21 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
3774 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3794 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3775 | return 0; | 3795 | return 0; |
3776 | 3796 | ||
3777 | if (event->attr.mmap) | 3797 | if ((!executable && event->attr.mmap_data) || |
3798 | (executable && event->attr.mmap)) | ||
3778 | return 1; | 3799 | return 1; |
3779 | 3800 | ||
3780 | return 0; | 3801 | return 0; |
3781 | } | 3802 | } |
3782 | 3803 | ||
3783 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, | 3804 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, |
3784 | struct perf_mmap_event *mmap_event) | 3805 | struct perf_mmap_event *mmap_event, |
3806 | int executable) | ||
3785 | { | 3807 | { |
3786 | struct perf_event *event; | 3808 | struct perf_event *event; |
3787 | 3809 | ||
3788 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 3810 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
3789 | if (perf_event_mmap_match(event, mmap_event)) | 3811 | if (perf_event_mmap_match(event, mmap_event, executable)) |
3790 | perf_event_mmap_output(event, mmap_event); | 3812 | perf_event_mmap_output(event, mmap_event); |
3791 | } | 3813 | } |
3792 | } | 3814 | } |
@@ -3830,6 +3852,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
3830 | if (!vma->vm_mm) { | 3852 | if (!vma->vm_mm) { |
3831 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | 3853 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); |
3832 | goto got_name; | 3854 | goto got_name; |
3855 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
3856 | vma->vm_end >= vma->vm_mm->brk) { | ||
3857 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | ||
3858 | goto got_name; | ||
3859 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | ||
3860 | vma->vm_end >= vma->vm_mm->start_stack) { | ||
3861 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | ||
3862 | goto got_name; | ||
3833 | } | 3863 | } |
3834 | 3864 | ||
3835 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 3865 | name = strncpy(tmp, "//anon", sizeof(tmp)); |
@@ -3846,17 +3876,17 @@ got_name: | |||
3846 | 3876 | ||
3847 | rcu_read_lock(); | 3877 | rcu_read_lock(); |
3848 | cpuctx = &get_cpu_var(perf_cpu_context); | 3878 | cpuctx = &get_cpu_var(perf_cpu_context); |
3849 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); | 3879 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); |
3850 | ctx = rcu_dereference(current->perf_event_ctxp); | 3880 | ctx = rcu_dereference(current->perf_event_ctxp); |
3851 | if (ctx) | 3881 | if (ctx) |
3852 | perf_event_mmap_ctx(ctx, mmap_event); | 3882 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); |
3853 | put_cpu_var(perf_cpu_context); | 3883 | put_cpu_var(perf_cpu_context); |
3854 | rcu_read_unlock(); | 3884 | rcu_read_unlock(); |
3855 | 3885 | ||
3856 | kfree(buf); | 3886 | kfree(buf); |
3857 | } | 3887 | } |
3858 | 3888 | ||
3859 | void __perf_event_mmap(struct vm_area_struct *vma) | 3889 | void perf_event_mmap(struct vm_area_struct *vma) |
3860 | { | 3890 | { |
3861 | struct perf_mmap_event mmap_event; | 3891 | struct perf_mmap_event mmap_event; |
3862 | 3892 | ||
@@ -4018,14 +4048,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) | |||
4018 | hwc->last_period = hwc->sample_period; | 4048 | hwc->last_period = hwc->sample_period; |
4019 | 4049 | ||
4020 | again: | 4050 | again: |
4021 | old = val = atomic64_read(&hwc->period_left); | 4051 | old = val = local64_read(&hwc->period_left); |
4022 | if (val < 0) | 4052 | if (val < 0) |
4023 | return 0; | 4053 | return 0; |
4024 | 4054 | ||
4025 | nr = div64_u64(period + val, period); | 4055 | nr = div64_u64(period + val, period); |
4026 | offset = nr * period; | 4056 | offset = nr * period; |
4027 | val -= offset; | 4057 | val -= offset; |
4028 | if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) | 4058 | if (local64_cmpxchg(&hwc->period_left, old, val) != old) |
4029 | goto again; | 4059 | goto again; |
4030 | 4060 | ||
4031 | return nr; | 4061 | return nr; |
@@ -4064,7 +4094,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4064 | { | 4094 | { |
4065 | struct hw_perf_event *hwc = &event->hw; | 4095 | struct hw_perf_event *hwc = &event->hw; |
4066 | 4096 | ||
4067 | atomic64_add(nr, &event->count); | 4097 | local64_add(nr, &event->count); |
4068 | 4098 | ||
4069 | if (!regs) | 4099 | if (!regs) |
4070 | return; | 4100 | return; |
@@ -4075,7 +4105,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
4075 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4105 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
4076 | return perf_swevent_overflow(event, 1, nmi, data, regs); | 4106 | return perf_swevent_overflow(event, 1, nmi, data, regs); |
4077 | 4107 | ||
4078 | if (atomic64_add_negative(nr, &hwc->period_left)) | 4108 | if (local64_add_negative(nr, &hwc->period_left)) |
4079 | return; | 4109 | return; |
4080 | 4110 | ||
4081 | perf_swevent_overflow(event, 0, nmi, data, regs); | 4111 | perf_swevent_overflow(event, 0, nmi, data, regs); |
@@ -4213,14 +4243,12 @@ int perf_swevent_get_recursion_context(void) | |||
4213 | } | 4243 | } |
4214 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4244 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4215 | 4245 | ||
4216 | void perf_swevent_put_recursion_context(int rctx) | 4246 | void inline perf_swevent_put_recursion_context(int rctx) |
4217 | { | 4247 | { |
4218 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4248 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
4219 | barrier(); | 4249 | barrier(); |
4220 | cpuctx->recursion[rctx]--; | 4250 | cpuctx->recursion[rctx]--; |
4221 | } | 4251 | } |
4222 | EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); | ||
4223 | |||
4224 | 4252 | ||
4225 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4253 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
4226 | struct pt_regs *regs, u64 addr) | 4254 | struct pt_regs *regs, u64 addr) |
@@ -4368,8 +4396,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) | |||
4368 | u64 now; | 4396 | u64 now; |
4369 | 4397 | ||
4370 | now = cpu_clock(cpu); | 4398 | now = cpu_clock(cpu); |
4371 | prev = atomic64_xchg(&event->hw.prev_count, now); | 4399 | prev = local64_xchg(&event->hw.prev_count, now); |
4372 | atomic64_add(now - prev, &event->count); | 4400 | local64_add(now - prev, &event->count); |
4373 | } | 4401 | } |
4374 | 4402 | ||
4375 | static int cpu_clock_perf_event_enable(struct perf_event *event) | 4403 | static int cpu_clock_perf_event_enable(struct perf_event *event) |
@@ -4377,7 +4405,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) | |||
4377 | struct hw_perf_event *hwc = &event->hw; | 4405 | struct hw_perf_event *hwc = &event->hw; |
4378 | int cpu = raw_smp_processor_id(); | 4406 | int cpu = raw_smp_processor_id(); |
4379 | 4407 | ||
4380 | atomic64_set(&hwc->prev_count, cpu_clock(cpu)); | 4408 | local64_set(&hwc->prev_count, cpu_clock(cpu)); |
4381 | perf_swevent_start_hrtimer(event); | 4409 | perf_swevent_start_hrtimer(event); |
4382 | 4410 | ||
4383 | return 0; | 4411 | return 0; |
@@ -4409,9 +4437,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) | |||
4409 | u64 prev; | 4437 | u64 prev; |
4410 | s64 delta; | 4438 | s64 delta; |
4411 | 4439 | ||
4412 | prev = atomic64_xchg(&event->hw.prev_count, now); | 4440 | prev = local64_xchg(&event->hw.prev_count, now); |
4413 | delta = now - prev; | 4441 | delta = now - prev; |
4414 | atomic64_add(delta, &event->count); | 4442 | local64_add(delta, &event->count); |
4415 | } | 4443 | } |
4416 | 4444 | ||
4417 | static int task_clock_perf_event_enable(struct perf_event *event) | 4445 | static int task_clock_perf_event_enable(struct perf_event *event) |
@@ -4421,7 +4449,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) | |||
4421 | 4449 | ||
4422 | now = event->ctx->time; | 4450 | now = event->ctx->time; |
4423 | 4451 | ||
4424 | atomic64_set(&hwc->prev_count, now); | 4452 | local64_set(&hwc->prev_count, now); |
4425 | 4453 | ||
4426 | perf_swevent_start_hrtimer(event); | 4454 | perf_swevent_start_hrtimer(event); |
4427 | 4455 | ||
@@ -4601,7 +4629,7 @@ static int perf_tp_event_match(struct perf_event *event, | |||
4601 | } | 4629 | } |
4602 | 4630 | ||
4603 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | 4631 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, |
4604 | struct pt_regs *regs, struct hlist_head *head) | 4632 | struct pt_regs *regs, struct hlist_head *head, int rctx) |
4605 | { | 4633 | { |
4606 | struct perf_sample_data data; | 4634 | struct perf_sample_data data; |
4607 | struct perf_event *event; | 4635 | struct perf_event *event; |
@@ -4615,12 +4643,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
4615 | perf_sample_data_init(&data, addr); | 4643 | perf_sample_data_init(&data, addr); |
4616 | data.raw = &raw; | 4644 | data.raw = &raw; |
4617 | 4645 | ||
4618 | rcu_read_lock(); | ||
4619 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4646 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4620 | if (perf_tp_event_match(event, &data, regs)) | 4647 | if (perf_tp_event_match(event, &data, regs)) |
4621 | perf_swevent_add(event, count, 1, &data, regs); | 4648 | perf_swevent_add(event, count, 1, &data, regs); |
4622 | } | 4649 | } |
4623 | rcu_read_unlock(); | 4650 | |
4651 | perf_swevent_put_recursion_context(rctx); | ||
4624 | } | 4652 | } |
4625 | EXPORT_SYMBOL_GPL(perf_tp_event); | 4653 | EXPORT_SYMBOL_GPL(perf_tp_event); |
4626 | 4654 | ||
@@ -4864,7 +4892,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
4864 | hwc->sample_period = 1; | 4892 | hwc->sample_period = 1; |
4865 | hwc->last_period = hwc->sample_period; | 4893 | hwc->last_period = hwc->sample_period; |
4866 | 4894 | ||
4867 | atomic64_set(&hwc->period_left, hwc->sample_period); | 4895 | local64_set(&hwc->period_left, hwc->sample_period); |
4868 | 4896 | ||
4869 | /* | 4897 | /* |
4870 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 4898 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
@@ -4913,7 +4941,7 @@ done: | |||
4913 | 4941 | ||
4914 | if (!event->parent) { | 4942 | if (!event->parent) { |
4915 | atomic_inc(&nr_events); | 4943 | atomic_inc(&nr_events); |
4916 | if (event->attr.mmap) | 4944 | if (event->attr.mmap || event->attr.mmap_data) |
4917 | atomic_inc(&nr_mmap_events); | 4945 | atomic_inc(&nr_mmap_events); |
4918 | if (event->attr.comm) | 4946 | if (event->attr.comm) |
4919 | atomic_inc(&nr_comm_events); | 4947 | atomic_inc(&nr_comm_events); |
@@ -5007,7 +5035,7 @@ err_size: | |||
5007 | static int | 5035 | static int |
5008 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 5036 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
5009 | { | 5037 | { |
5010 | struct perf_mmap_data *data = NULL, *old_data = NULL; | 5038 | struct perf_buffer *buffer = NULL, *old_buffer = NULL; |
5011 | int ret = -EINVAL; | 5039 | int ret = -EINVAL; |
5012 | 5040 | ||
5013 | if (!output_event) | 5041 | if (!output_event) |
@@ -5037,19 +5065,19 @@ set: | |||
5037 | 5065 | ||
5038 | if (output_event) { | 5066 | if (output_event) { |
5039 | /* get the buffer we want to redirect to */ | 5067 | /* get the buffer we want to redirect to */ |
5040 | data = perf_mmap_data_get(output_event); | 5068 | buffer = perf_buffer_get(output_event); |
5041 | if (!data) | 5069 | if (!buffer) |
5042 | goto unlock; | 5070 | goto unlock; |
5043 | } | 5071 | } |
5044 | 5072 | ||
5045 | old_data = event->data; | 5073 | old_buffer = event->buffer; |
5046 | rcu_assign_pointer(event->data, data); | 5074 | rcu_assign_pointer(event->buffer, buffer); |
5047 | ret = 0; | 5075 | ret = 0; |
5048 | unlock: | 5076 | unlock: |
5049 | mutex_unlock(&event->mmap_mutex); | 5077 | mutex_unlock(&event->mmap_mutex); |
5050 | 5078 | ||
5051 | if (old_data) | 5079 | if (old_buffer) |
5052 | perf_mmap_data_put(old_data); | 5080 | perf_buffer_put(old_buffer); |
5053 | out: | 5081 | out: |
5054 | return ret; | 5082 | return ret; |
5055 | } | 5083 | } |
@@ -5298,7 +5326,7 @@ inherit_event(struct perf_event *parent_event, | |||
5298 | hwc->sample_period = sample_period; | 5326 | hwc->sample_period = sample_period; |
5299 | hwc->last_period = sample_period; | 5327 | hwc->last_period = sample_period; |
5300 | 5328 | ||
5301 | atomic64_set(&hwc->period_left, sample_period); | 5329 | local64_set(&hwc->period_left, sample_period); |
5302 | } | 5330 | } |
5303 | 5331 | ||
5304 | child_event->overflow_handler = parent_event->overflow_handler; | 5332 | child_event->overflow_handler = parent_event->overflow_handler; |
@@ -5359,12 +5387,12 @@ static void sync_child_event(struct perf_event *child_event, | |||
5359 | if (child_event->attr.inherit_stat) | 5387 | if (child_event->attr.inherit_stat) |
5360 | perf_event_read_event(child_event, child); | 5388 | perf_event_read_event(child_event, child); |
5361 | 5389 | ||
5362 | child_val = atomic64_read(&child_event->count); | 5390 | child_val = perf_event_count(child_event); |
5363 | 5391 | ||
5364 | /* | 5392 | /* |
5365 | * Add back the child's count to the parent's count: | 5393 | * Add back the child's count to the parent's count: |
5366 | */ | 5394 | */ |
5367 | atomic64_add(child_val, &parent_event->count); | 5395 | atomic64_add(child_val, &parent_event->child_count); |
5368 | atomic64_add(child_event->total_time_enabled, | 5396 | atomic64_add(child_event->total_time_enabled, |
5369 | &parent_event->child_total_time_enabled); | 5397 | &parent_event->child_total_time_enabled); |
5370 | atomic64_add(child_event->total_time_running, | 5398 | atomic64_add(child_event->total_time_running, |
@@ -5733,15 +5761,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
5733 | { | 5761 | { |
5734 | unsigned int cpu = (long)hcpu; | 5762 | unsigned int cpu = (long)hcpu; |
5735 | 5763 | ||
5736 | switch (action) { | 5764 | switch (action & ~CPU_TASKS_FROZEN) { |
5737 | 5765 | ||
5738 | case CPU_UP_PREPARE: | 5766 | case CPU_UP_PREPARE: |
5739 | case CPU_UP_PREPARE_FROZEN: | 5767 | case CPU_DOWN_FAILED: |
5740 | perf_event_init_cpu(cpu); | 5768 | perf_event_init_cpu(cpu); |
5741 | break; | 5769 | break; |
5742 | 5770 | ||
5771 | case CPU_UP_CANCELED: | ||
5743 | case CPU_DOWN_PREPARE: | 5772 | case CPU_DOWN_PREPARE: |
5744 | case CPU_DOWN_PREPARE_FROZEN: | ||
5745 | perf_event_exit_cpu(cpu); | 5773 | perf_event_exit_cpu(cpu); |
5746 | break; | 5774 | break; |
5747 | 5775 | ||
diff --git a/kernel/pid.c b/kernel/pid.c index e9fd8c132d26..d55c6fb8d087 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid) | |||
122 | atomic_inc(&map->nr_free); | 122 | atomic_inc(&map->nr_free); |
123 | } | 123 | } |
124 | 124 | ||
125 | /* | ||
126 | * If we started walking pids at 'base', is 'a' seen before 'b'? | ||
127 | */ | ||
128 | static int pid_before(int base, int a, int b) | ||
129 | { | ||
130 | /* | ||
131 | * This is the same as saying | ||
132 | * | ||
133 | * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT | ||
134 | * and that mapping orders 'a' and 'b' with respect to 'base'. | ||
135 | */ | ||
136 | return (unsigned)(a - base) < (unsigned)(b - base); | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | ||
141 | * We want the winner to have the "later" value, because if the | ||
142 | * "earlier" value prevails, then a pid may get reused immediately. | ||
143 | * | ||
144 | * Since pids rollover, it is not sufficient to just pick the bigger | ||
145 | * value. We have to consider where we started counting from. | ||
146 | * | ||
147 | * 'base' is the value of pid_ns->last_pid that we observed when | ||
148 | * we started looking for a pid. | ||
149 | * | ||
150 | * 'pid' is the pid that we eventually found. | ||
151 | */ | ||
152 | static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) | ||
153 | { | ||
154 | int prev; | ||
155 | int last_write = base; | ||
156 | do { | ||
157 | prev = last_write; | ||
158 | last_write = cmpxchg(&pid_ns->last_pid, prev, pid); | ||
159 | } while ((prev != last_write) && (pid_before(base, last_write, pid))); | ||
160 | } | ||
161 | |||
125 | static int alloc_pidmap(struct pid_namespace *pid_ns) | 162 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
126 | { | 163 | { |
127 | int i, offset, max_scan, pid, last = pid_ns->last_pid; | 164 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
132 | pid = RESERVED_PIDS; | 169 | pid = RESERVED_PIDS; |
133 | offset = pid & BITS_PER_PAGE_MASK; | 170 | offset = pid & BITS_PER_PAGE_MASK; |
134 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; | 171 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
135 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | 172 | /* |
173 | * If last_pid points into the middle of the map->page we | ||
174 | * want to scan this bitmap block twice, the second time | ||
175 | * we start with offset == 0 (or RESERVED_PIDS). | ||
176 | */ | ||
177 | max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; | ||
136 | for (i = 0; i <= max_scan; ++i) { | 178 | for (i = 0; i <= max_scan; ++i) { |
137 | if (unlikely(!map->page)) { | 179 | if (unlikely(!map->page)) { |
138 | void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 180 | void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
154 | do { | 196 | do { |
155 | if (!test_and_set_bit(offset, map->page)) { | 197 | if (!test_and_set_bit(offset, map->page)) { |
156 | atomic_dec(&map->nr_free); | 198 | atomic_dec(&map->nr_free); |
157 | pid_ns->last_pid = pid; | 199 | set_last_pid(pid_ns, last, pid); |
158 | return pid; | 200 | return pid; |
159 | } | 201 | } |
160 | offset = find_next_offset(map, offset); | 202 | offset = find_next_offset(map, offset); |
161 | pid = mk_pid(pid_ns, map, offset); | 203 | pid = mk_pid(pid_ns, map, offset); |
162 | /* | 204 | } while (offset < BITS_PER_PAGE && pid < pid_max); |
163 | * find_next_offset() found a bit, the pid from it | ||
164 | * is in-bounds, and if we fell back to the last | ||
165 | * bitmap block and the final block was the same | ||
166 | * as the starting point, pid is before last_pid. | ||
167 | */ | ||
168 | } while (offset < BITS_PER_PAGE && pid < pid_max && | ||
169 | (i != max_scan || pid < last || | ||
170 | !((last+1) & BITS_PER_PAGE_MASK))); | ||
171 | } | 205 | } |
172 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 206 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
173 | ++map; | 207 | ++map; |
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 996a4dec5f96..645e541a45f6 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
@@ -212,15 +212,17 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); | |||
212 | 212 | ||
213 | /** | 213 | /** |
214 | * pm_qos_add_request - inserts new qos request into the list | 214 | * pm_qos_add_request - inserts new qos request into the list |
215 | * @pm_qos_class: identifies which list of qos request to us | 215 | * @dep: pointer to a preallocated handle |
216 | * @pm_qos_class: identifies which list of qos request to use | ||
216 | * @value: defines the qos request | 217 | * @value: defines the qos request |
217 | * | 218 | * |
218 | * This function inserts a new entry in the pm_qos_class list of requested qos | 219 | * This function inserts a new entry in the pm_qos_class list of requested qos |
219 | * performance characteristics. It recomputes the aggregate QoS expectations | 220 | * performance characteristics. It recomputes the aggregate QoS expectations |
220 | * for the pm_qos_class of parameters, and returns the pm_qos_request list | 221 | * for the pm_qos_class of parameters and initializes the pm_qos_request_list |
221 | * element as a handle for use in updating and removal. Call needs to save | 222 | * handle. Caller needs to save this handle for later use in updates and |
222 | * this handle for later use. | 223 | * removal. |
223 | */ | 224 | */ |
225 | |||
224 | void pm_qos_add_request(struct pm_qos_request_list *dep, | 226 | void pm_qos_add_request(struct pm_qos_request_list *dep, |
225 | int pm_qos_class, s32 value) | 227 | int pm_qos_class, s32 value) |
226 | { | 228 | { |
@@ -348,7 +350,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) | |||
348 | 350 | ||
349 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); | 351 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); |
350 | if (pm_qos_class >= 0) { | 352 | if (pm_qos_class >= 0) { |
351 | struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); | 353 | struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); |
352 | if (!req) | 354 | if (!req) |
353 | return -ENOMEM; | 355 | return -ENOMEM; |
354 | 356 | ||
@@ -387,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
387 | } else if (count == 11) { /* len('0x12345678/0') */ | 389 | } else if (count == 11) { /* len('0x12345678/0') */ |
388 | if (copy_from_user(ascii_value, buf, 11)) | 390 | if (copy_from_user(ascii_value, buf, 11)) |
389 | return -EFAULT; | 391 | return -EFAULT; |
392 | if (strlen(ascii_value) != 10) | ||
393 | return -EINVAL; | ||
390 | x = sscanf(ascii_value, "%x", &value); | 394 | x = sscanf(ascii_value, "%x", &value); |
391 | if (x != 1) | 395 | if (x != 1) |
392 | return -EINVAL; | 396 | return -EINVAL; |
393 | pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); | 397 | pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); |
394 | } else | 398 | } else |
395 | return -EINVAL; | 399 | return -EINVAL; |
396 | 400 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399c..6842eeba5879 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -16,13 +16,13 @@ | |||
16 | * siglock protection since other code may update expiration cache as | 16 | * siglock protection since other code may update expiration cache as |
17 | * well. | 17 | * well. |
18 | */ | 18 | */ |
19 | void update_rlimit_cpu(unsigned long rlim_new) | 19 | void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) |
20 | { | 20 | { |
21 | cputime_t cputime = secs_to_cputime(rlim_new); | 21 | cputime_t cputime = secs_to_cputime(rlim_new); |
22 | 22 | ||
23 | spin_lock_irq(&current->sighand->siglock); | 23 | spin_lock_irq(&task->sighand->siglock); |
24 | set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); | 24 | set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); |
25 | spin_unlock_irq(&current->sighand->siglock); | 25 | spin_unlock_irq(&task->sighand->siglock); |
26 | } | 26 | } |
27 | 27 | ||
28 | static int check_clock(const clockid_t which_clock) | 28 | static int check_clock(const clockid_t which_clock) |
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
232 | 232 | ||
233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
234 | { | 234 | { |
235 | struct sighand_struct *sighand; | 235 | struct signal_struct *sig = tsk->signal; |
236 | struct signal_struct *sig; | ||
237 | struct task_struct *t; | 236 | struct task_struct *t; |
238 | 237 | ||
239 | *times = INIT_CPUTIME; | 238 | times->utime = sig->utime; |
239 | times->stime = sig->stime; | ||
240 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
240 | 241 | ||
241 | rcu_read_lock(); | 242 | rcu_read_lock(); |
242 | sighand = rcu_dereference(tsk->sighand); | 243 | /* make sure we can trust tsk->thread_group list */ |
243 | if (!sighand) | 244 | if (!likely(pid_alive(tsk))) |
244 | goto out; | 245 | goto out; |
245 | 246 | ||
246 | sig = tsk->signal; | ||
247 | |||
248 | t = tsk; | 247 | t = tsk; |
249 | do { | 248 | do { |
250 | times->utime = cputime_add(times->utime, t->utime); | 249 | times->utime = cputime_add(times->utime, t->utime); |
251 | times->stime = cputime_add(times->stime, t->stime); | 250 | times->stime = cputime_add(times->stime, t->stime); |
252 | times->sum_exec_runtime += t->se.sum_exec_runtime; | 251 | times->sum_exec_runtime += t->se.sum_exec_runtime; |
253 | 252 | } while_each_thread(tsk, t); | |
254 | t = next_thread(t); | ||
255 | } while (t != tsk); | ||
256 | |||
257 | times->utime = cputime_add(times->utime, sig->utime); | ||
258 | times->stime = cputime_add(times->stime, sig->stime); | ||
259 | times->sum_exec_runtime += sig->sum_sched_runtime; | ||
260 | out: | 253 | out: |
261 | rcu_read_unlock(); | 254 | rcu_read_unlock(); |
262 | } | 255 | } |
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1279 | { | 1272 | { |
1280 | struct signal_struct *sig; | 1273 | struct signal_struct *sig; |
1281 | 1274 | ||
1282 | /* tsk == current, ensure it is safe to use ->signal/sighand */ | ||
1283 | if (unlikely(tsk->exit_state)) | ||
1284 | return 0; | ||
1285 | |||
1286 | if (!task_cputime_zero(&tsk->cputime_expires)) { | 1275 | if (!task_cputime_zero(&tsk->cputime_expires)) { |
1287 | struct task_cputime task_sample = { | 1276 | struct task_cputime task_sample = { |
1288 | .utime = tsk->utime, | 1277 | .utime = tsk->utime, |
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1298 | if (sig->cputimer.running) { | 1287 | if (sig->cputimer.running) { |
1299 | struct task_cputime group_sample; | 1288 | struct task_cputime group_sample; |
1300 | 1289 | ||
1301 | thread_group_cputimer(tsk, &group_sample); | 1290 | spin_lock(&sig->cputimer.lock); |
1291 | group_sample = sig->cputimer.cputime; | ||
1292 | spin_unlock(&sig->cputimer.lock); | ||
1293 | |||
1302 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1294 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1303 | return 1; | 1295 | return 1; |
1304 | } | 1296 | } |
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1315 | { | 1307 | { |
1316 | LIST_HEAD(firing); | 1308 | LIST_HEAD(firing); |
1317 | struct k_itimer *timer, *next; | 1309 | struct k_itimer *timer, *next; |
1310 | unsigned long flags; | ||
1318 | 1311 | ||
1319 | BUG_ON(!irqs_disabled()); | 1312 | BUG_ON(!irqs_disabled()); |
1320 | 1313 | ||
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1325 | if (!fastpath_timer_check(tsk)) | 1318 | if (!fastpath_timer_check(tsk)) |
1326 | return; | 1319 | return; |
1327 | 1320 | ||
1328 | spin_lock(&tsk->sighand->siglock); | 1321 | if (!lock_task_sighand(tsk, &flags)) |
1322 | return; | ||
1329 | /* | 1323 | /* |
1330 | * Here we take off tsk->signal->cpu_timers[N] and | 1324 | * Here we take off tsk->signal->cpu_timers[N] and |
1331 | * tsk->cpu_timers[N] all the timers that are firing, and | 1325 | * tsk->cpu_timers[N] all the timers that are firing, and |
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1347 | * that gets the timer lock before we do will give it up and | 1341 | * that gets the timer lock before we do will give it up and |
1348 | * spin until we've taken care of that timer below. | 1342 | * spin until we've taken care of that timer below. |
1349 | */ | 1343 | */ |
1350 | spin_unlock(&tsk->sighand->siglock); | 1344 | unlock_task_sighand(tsk, &flags); |
1351 | 1345 | ||
1352 | /* | 1346 | /* |
1353 | * Now that all the timers on our list have the firing flag, | 1347 | * Now that all the timers on our list have the firing flag, |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc3..9ca4973f736d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
560 | new_timer->it_clock = which_clock; | 560 | new_timer->it_clock = which_clock; |
561 | new_timer->it_overrun = -1; | 561 | new_timer->it_overrun = -1; |
562 | 562 | ||
563 | if (copy_to_user(created_timer_id, | ||
564 | &new_timer_id, sizeof (new_timer_id))) { | ||
565 | error = -EFAULT; | ||
566 | goto out; | ||
567 | } | ||
568 | if (timer_event_spec) { | 563 | if (timer_event_spec) { |
569 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { | 564 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { |
570 | error = -EFAULT; | 565 | error = -EFAULT; |
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
590 | new_timer->sigq->info.si_tid = new_timer->it_id; | 585 | new_timer->sigq->info.si_tid = new_timer->it_id; |
591 | new_timer->sigq->info.si_code = SI_TIMER; | 586 | new_timer->sigq->info.si_code = SI_TIMER; |
592 | 587 | ||
588 | if (copy_to_user(created_timer_id, | ||
589 | &new_timer_id, sizeof (new_timer_id))) { | ||
590 | error = -EFAULT; | ||
591 | goto out; | ||
592 | } | ||
593 | |||
593 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 594 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); |
594 | if (error) | 595 | if (error) |
595 | goto out; | 596 | goto out; |
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 97024fd40cd5..83bbc7c02df9 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c | |||
@@ -28,7 +28,7 @@ | |||
28 | static int submit(int rw, struct block_device *bdev, sector_t sector, | 28 | static int submit(int rw, struct block_device *bdev, sector_t sector, |
29 | struct page *page, struct bio **bio_chain) | 29 | struct page *page, struct bio **bio_chain) |
30 | { | 30 | { |
31 | const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); | 31 | const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; |
32 | struct bio *bio; | 32 | struct bio *bio; |
33 | 33 | ||
34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | 34 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); |
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index e8b337006276..d52359374e85 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
@@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy) | |||
24 | 24 | ||
25 | static DECLARE_WORK(poweroff_work, do_poweroff); | 25 | static DECLARE_WORK(poweroff_work, do_poweroff); |
26 | 26 | ||
27 | static void handle_poweroff(int key, struct tty_struct *tty) | 27 | static void handle_poweroff(int key) |
28 | { | 28 | { |
29 | /* run sysrq poweroff on boot cpu */ | 29 | /* run sysrq poweroff on boot cpu */ |
30 | schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); | 30 | schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab6..028a99598f49 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/workqueue.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * Timeout for stopping processes | 21 | * Timeout for stopping processes |
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
35 | struct task_struct *g, *p; | 36 | struct task_struct *g, *p; |
36 | unsigned long end_time; | 37 | unsigned long end_time; |
37 | unsigned int todo; | 38 | unsigned int todo; |
39 | bool wq_busy = false; | ||
38 | struct timeval start, end; | 40 | struct timeval start, end; |
39 | u64 elapsed_csecs64; | 41 | u64 elapsed_csecs64; |
40 | unsigned int elapsed_csecs; | 42 | unsigned int elapsed_csecs; |
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) | |||
42 | do_gettimeofday(&start); | 44 | do_gettimeofday(&start); |
43 | 45 | ||
44 | end_time = jiffies + TIMEOUT; | 46 | end_time = jiffies + TIMEOUT; |
47 | |||
48 | if (!sig_only) | ||
49 | freeze_workqueues_begin(); | ||
50 | |||
45 | while (true) { | 51 | while (true) { |
46 | todo = 0; | 52 | todo = 0; |
47 | read_lock(&tasklist_lock); | 53 | read_lock(&tasklist_lock); |
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
63 | todo++; | 69 | todo++; |
64 | } while_each_thread(g, p); | 70 | } while_each_thread(g, p); |
65 | read_unlock(&tasklist_lock); | 71 | read_unlock(&tasklist_lock); |
72 | |||
73 | if (!sig_only) { | ||
74 | wq_busy = freeze_workqueues_busy(); | ||
75 | todo += wq_busy; | ||
76 | } | ||
77 | |||
66 | if (!todo || time_after(jiffies, end_time)) | 78 | if (!todo || time_after(jiffies, end_time)) |
67 | break; | 79 | break; |
68 | 80 | ||
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
86 | */ | 98 | */ |
87 | printk("\n"); | 99 | printk("\n"); |
88 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | 100 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " |
89 | "(%d tasks refusing to freeze):\n", | 101 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
90 | elapsed_csecs / 100, elapsed_csecs % 100, todo); | 102 | elapsed_csecs / 100, elapsed_csecs % 100, |
103 | todo - wq_busy, wq_busy); | ||
104 | |||
105 | thaw_workqueues(); | ||
106 | |||
91 | read_lock(&tasklist_lock); | 107 | read_lock(&tasklist_lock); |
92 | do_each_thread(g, p) { | 108 | do_each_thread(g, p) { |
93 | task_lock(p); | 109 | task_lock(p); |
@@ -157,6 +173,7 @@ void thaw_processes(void) | |||
157 | oom_killer_enable(); | 173 | oom_killer_enable(); |
158 | 174 | ||
159 | printk("Restarting tasks ... "); | 175 | printk("Restarting tasks ... "); |
176 | thaw_workqueues(); | ||
160 | thaw_tasks(true); | 177 | thaw_tasks(true); |
161 | thaw_tasks(false); | 178 | thaw_tasks(false); |
162 | schedule(); | 179 | schedule(); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f6cd6faf84fd..d3f795f01bbc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1121,9 +1121,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) | |||
1121 | return nr_alloc; | 1121 | return nr_alloc; |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | static unsigned long preallocate_image_memory(unsigned long nr_pages) | 1124 | static unsigned long preallocate_image_memory(unsigned long nr_pages, |
1125 | unsigned long avail_normal) | ||
1125 | { | 1126 | { |
1126 | return preallocate_image_pages(nr_pages, GFP_IMAGE); | 1127 | unsigned long alloc; |
1128 | |||
1129 | if (avail_normal <= alloc_normal) | ||
1130 | return 0; | ||
1131 | |||
1132 | alloc = avail_normal - alloc_normal; | ||
1133 | if (nr_pages < alloc) | ||
1134 | alloc = nr_pages; | ||
1135 | |||
1136 | return preallocate_image_pages(alloc, GFP_IMAGE); | ||
1127 | } | 1137 | } |
1128 | 1138 | ||
1129 | #ifdef CONFIG_HIGHMEM | 1139 | #ifdef CONFIG_HIGHMEM |
@@ -1169,15 +1179,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, | |||
1169 | */ | 1179 | */ |
1170 | static void free_unnecessary_pages(void) | 1180 | static void free_unnecessary_pages(void) |
1171 | { | 1181 | { |
1172 | unsigned long save_highmem, to_free_normal, to_free_highmem; | 1182 | unsigned long save, to_free_normal, to_free_highmem; |
1173 | 1183 | ||
1174 | to_free_normal = alloc_normal - count_data_pages(); | 1184 | save = count_data_pages(); |
1175 | save_highmem = count_highmem_pages(); | 1185 | if (alloc_normal >= save) { |
1176 | if (alloc_highmem > save_highmem) { | 1186 | to_free_normal = alloc_normal - save; |
1177 | to_free_highmem = alloc_highmem - save_highmem; | 1187 | save = 0; |
1188 | } else { | ||
1189 | to_free_normal = 0; | ||
1190 | save -= alloc_normal; | ||
1191 | } | ||
1192 | save += count_highmem_pages(); | ||
1193 | if (alloc_highmem >= save) { | ||
1194 | to_free_highmem = alloc_highmem - save; | ||
1178 | } else { | 1195 | } else { |
1179 | to_free_highmem = 0; | 1196 | to_free_highmem = 0; |
1180 | to_free_normal -= save_highmem - alloc_highmem; | 1197 | to_free_normal -= save - alloc_highmem; |
1181 | } | 1198 | } |
1182 | 1199 | ||
1183 | memory_bm_position_reset(&copy_bm); | 1200 | memory_bm_position_reset(&copy_bm); |
@@ -1258,7 +1275,7 @@ int hibernate_preallocate_memory(void) | |||
1258 | { | 1275 | { |
1259 | struct zone *zone; | 1276 | struct zone *zone; |
1260 | unsigned long saveable, size, max_size, count, highmem, pages = 0; | 1277 | unsigned long saveable, size, max_size, count, highmem, pages = 0; |
1261 | unsigned long alloc, save_highmem, pages_highmem; | 1278 | unsigned long alloc, save_highmem, pages_highmem, avail_normal; |
1262 | struct timeval start, stop; | 1279 | struct timeval start, stop; |
1263 | int error; | 1280 | int error; |
1264 | 1281 | ||
@@ -1295,6 +1312,7 @@ int hibernate_preallocate_memory(void) | |||
1295 | else | 1312 | else |
1296 | count += zone_page_state(zone, NR_FREE_PAGES); | 1313 | count += zone_page_state(zone, NR_FREE_PAGES); |
1297 | } | 1314 | } |
1315 | avail_normal = count; | ||
1298 | count += highmem; | 1316 | count += highmem; |
1299 | count -= totalreserve_pages; | 1317 | count -= totalreserve_pages; |
1300 | 1318 | ||
@@ -1309,12 +1327,21 @@ int hibernate_preallocate_memory(void) | |||
1309 | */ | 1327 | */ |
1310 | if (size >= saveable) { | 1328 | if (size >= saveable) { |
1311 | pages = preallocate_image_highmem(save_highmem); | 1329 | pages = preallocate_image_highmem(save_highmem); |
1312 | pages += preallocate_image_memory(saveable - pages); | 1330 | pages += preallocate_image_memory(saveable - pages, avail_normal); |
1313 | goto out; | 1331 | goto out; |
1314 | } | 1332 | } |
1315 | 1333 | ||
1316 | /* Estimate the minimum size of the image. */ | 1334 | /* Estimate the minimum size of the image. */ |
1317 | pages = minimum_image_size(saveable); | 1335 | pages = minimum_image_size(saveable); |
1336 | /* | ||
1337 | * To avoid excessive pressure on the normal zone, leave room in it to | ||
1338 | * accommodate an image of the minimum size (unless it's already too | ||
1339 | * small, in which case don't preallocate pages from it at all). | ||
1340 | */ | ||
1341 | if (avail_normal > pages) | ||
1342 | avail_normal -= pages; | ||
1343 | else | ||
1344 | avail_normal = 0; | ||
1318 | if (size < pages) | 1345 | if (size < pages) |
1319 | size = min_t(unsigned long, pages, max_size); | 1346 | size = min_t(unsigned long, pages, max_size); |
1320 | 1347 | ||
@@ -1335,16 +1362,34 @@ int hibernate_preallocate_memory(void) | |||
1335 | */ | 1362 | */ |
1336 | pages_highmem = preallocate_image_highmem(highmem / 2); | 1363 | pages_highmem = preallocate_image_highmem(highmem / 2); |
1337 | alloc = (count - max_size) - pages_highmem; | 1364 | alloc = (count - max_size) - pages_highmem; |
1338 | pages = preallocate_image_memory(alloc); | 1365 | pages = preallocate_image_memory(alloc, avail_normal); |
1339 | if (pages < alloc) | 1366 | if (pages < alloc) { |
1340 | goto err_out; | 1367 | /* We have exhausted non-highmem pages, try highmem. */ |
1341 | size = max_size - size; | 1368 | alloc -= pages; |
1342 | alloc = size; | 1369 | pages += pages_highmem; |
1343 | size = preallocate_highmem_fraction(size, highmem, count); | 1370 | pages_highmem = preallocate_image_highmem(alloc); |
1344 | pages_highmem += size; | 1371 | if (pages_highmem < alloc) |
1345 | alloc -= size; | 1372 | goto err_out; |
1346 | pages += preallocate_image_memory(alloc); | 1373 | pages += pages_highmem; |
1347 | pages += pages_highmem; | 1374 | /* |
1375 | * size is the desired number of saveable pages to leave in | ||
1376 | * memory, so try to preallocate (all memory - size) pages. | ||
1377 | */ | ||
1378 | alloc = (count - pages) - size; | ||
1379 | pages += preallocate_image_highmem(alloc); | ||
1380 | } else { | ||
1381 | /* | ||
1382 | * There are approximately max_size saveable pages at this point | ||
1383 | * and we want to reduce this number down to size. | ||
1384 | */ | ||
1385 | alloc = max_size - size; | ||
1386 | size = preallocate_highmem_fraction(alloc, highmem, count); | ||
1387 | pages_highmem += size; | ||
1388 | alloc -= size; | ||
1389 | size = preallocate_image_memory(alloc, avail_normal); | ||
1390 | pages_highmem += preallocate_image_highmem(alloc - size); | ||
1391 | pages += pages_highmem + size; | ||
1392 | } | ||
1348 | 1393 | ||
1349 | /* | 1394 | /* |
1350 | * We only need as many page frames for the image as there are saveable | 1395 | * We only need as many page frames for the image as there are saveable |
diff --git a/kernel/printk.c b/kernel/printk.c index 444b770c9595..8fe465ac008a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -37,6 +37,8 @@ | |||
37 | #include <linux/ratelimit.h> | 37 | #include <linux/ratelimit.h> |
38 | #include <linux/kmsg_dump.h> | 38 | #include <linux/kmsg_dump.h> |
39 | #include <linux/syslog.h> | 39 | #include <linux/syslog.h> |
40 | #include <linux/cpu.h> | ||
41 | #include <linux/notifier.h> | ||
40 | 42 | ||
41 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
42 | 44 | ||
@@ -985,6 +987,32 @@ void resume_console(void) | |||
985 | } | 987 | } |
986 | 988 | ||
987 | /** | 989 | /** |
990 | * console_cpu_notify - print deferred console messages after CPU hotplug | ||
991 | * @self: notifier struct | ||
992 | * @action: CPU hotplug event | ||
993 | * @hcpu: unused | ||
994 | * | ||
995 | * If printk() is called from a CPU that is not online yet, the messages | ||
996 | * will be spooled but will not show up on the console. This function is | ||
997 | * called when a new CPU comes online (or fails to come up), and ensures | ||
998 | * that any such output gets printed. | ||
999 | */ | ||
1000 | static int __cpuinit console_cpu_notify(struct notifier_block *self, | ||
1001 | unsigned long action, void *hcpu) | ||
1002 | { | ||
1003 | switch (action) { | ||
1004 | case CPU_ONLINE: | ||
1005 | case CPU_DEAD: | ||
1006 | case CPU_DYING: | ||
1007 | case CPU_DOWN_FAILED: | ||
1008 | case CPU_UP_CANCELED: | ||
1009 | acquire_console_sem(); | ||
1010 | release_console_sem(); | ||
1011 | } | ||
1012 | return NOTIFY_OK; | ||
1013 | } | ||
1014 | |||
1015 | /** | ||
988 | * acquire_console_sem - lock the console system for exclusive use. | 1016 | * acquire_console_sem - lock the console system for exclusive use. |
989 | * | 1017 | * |
990 | * Acquires a semaphore which guarantees that the caller has | 1018 | * Acquires a semaphore which guarantees that the caller has |
@@ -1371,7 +1399,7 @@ int unregister_console(struct console *console) | |||
1371 | } | 1399 | } |
1372 | EXPORT_SYMBOL(unregister_console); | 1400 | EXPORT_SYMBOL(unregister_console); |
1373 | 1401 | ||
1374 | static int __init disable_boot_consoles(void) | 1402 | static int __init printk_late_init(void) |
1375 | { | 1403 | { |
1376 | struct console *con; | 1404 | struct console *con; |
1377 | 1405 | ||
@@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void) | |||
1382 | unregister_console(con); | 1410 | unregister_console(con); |
1383 | } | 1411 | } |
1384 | } | 1412 | } |
1413 | hotcpu_notifier(console_cpu_notify, 0); | ||
1385 | return 0; | 1414 | return 0; |
1386 | } | 1415 | } |
1387 | late_initcall(disable_boot_consoles); | 1416 | late_initcall(printk_late_init); |
1388 | 1417 | ||
1389 | #if defined CONFIG_PRINTK | 1418 | #if defined CONFIG_PRINTK |
1390 | 1419 | ||
@@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1520 | chars = logged_chars; | 1549 | chars = logged_chars; |
1521 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1550 | spin_unlock_irqrestore(&logbuf_lock, flags); |
1522 | 1551 | ||
1523 | if (logged_chars > end) { | 1552 | if (chars > end) { |
1524 | s1 = log_buf + log_buf_len - logged_chars + end; | 1553 | s1 = log_buf + log_buf_len - chars + end; |
1525 | l1 = logged_chars - end; | 1554 | l1 = chars - end; |
1526 | 1555 | ||
1527 | s2 = log_buf; | 1556 | s2 = log_buf; |
1528 | l2 = end; | 1557 | l2 = end; |
@@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1530 | s1 = ""; | 1559 | s1 = ""; |
1531 | l1 = 0; | 1560 | l1 = 0; |
1532 | 1561 | ||
1533 | s2 = log_buf + end - logged_chars; | 1562 | s2 = log_buf + end - chars; |
1534 | l2 = logged_chars; | 1563 | l2 = chars; |
1535 | } | 1564 | } |
1536 | 1565 | ||
1537 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { | 1566 | if (!spin_trylock_irqsave(&dump_list_lock, flags)) { |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 74a3d693c196..f34d798ef4a2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -324,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
324 | } | 324 | } |
325 | 325 | ||
326 | /* | 326 | /* |
327 | * Detach all tasks we were using ptrace on. | 327 | * Detach all tasks we were using ptrace on. Called with tasklist held |
328 | * for writing, and returns with it held too. But note it can release | ||
329 | * and reacquire the lock. | ||
328 | */ | 330 | */ |
329 | void exit_ptrace(struct task_struct *tracer) | 331 | void exit_ptrace(struct task_struct *tracer) |
330 | { | 332 | { |
331 | struct task_struct *p, *n; | 333 | struct task_struct *p, *n; |
332 | LIST_HEAD(ptrace_dead); | 334 | LIST_HEAD(ptrace_dead); |
333 | 335 | ||
334 | write_lock_irq(&tasklist_lock); | 336 | if (likely(list_empty(&tracer->ptraced))) |
337 | return; | ||
338 | |||
335 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { | 339 | list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { |
336 | if (__ptrace_detach(tracer, p)) | 340 | if (__ptrace_detach(tracer, p)) |
337 | list_add(&p->ptrace_entry, &ptrace_dead); | 341 | list_add(&p->ptrace_entry, &ptrace_dead); |
338 | } | 342 | } |
339 | write_unlock_irq(&tasklist_lock); | ||
340 | 343 | ||
344 | write_unlock_irq(&tasklist_lock); | ||
341 | BUG_ON(!list_empty(&tracer->ptraced)); | 345 | BUG_ON(!list_empty(&tracer->ptraced)); |
342 | 346 | ||
343 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { | 347 | list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { |
344 | list_del_init(&p->ptrace_entry); | 348 | list_del_init(&p->ptrace_entry); |
345 | release_task(p); | 349 | release_task(p); |
346 | } | 350 | } |
351 | |||
352 | write_lock_irq(&tasklist_lock); | ||
347 | } | 353 | } |
348 | 354 | ||
349 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | 355 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) |
diff --git a/kernel/range.c b/kernel/range.c index 74e2e6114927..471b66acabb5 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
@@ -7,10 +7,6 @@ | |||
7 | 7 | ||
8 | #include <linux/range.h> | 8 | #include <linux/range.h> |
9 | 9 | ||
10 | #ifndef ARRAY_SIZE | ||
11 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
12 | #endif | ||
13 | |||
14 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | 10 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) |
15 | { | 11 | { |
16 | if (start >= end) | 12 | if (start >= end) |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f5..4d169835fb36 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) | |||
114 | } | 114 | } |
115 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); | 115 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); |
116 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 116 | #endif /* #ifdef CONFIG_PROVE_RCU */ |
117 | |||
118 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
119 | static inline void debug_init_rcu_head(struct rcu_head *head) | ||
120 | { | ||
121 | debug_object_init(head, &rcuhead_debug_descr); | ||
122 | } | ||
123 | |||
124 | static inline void debug_rcu_head_free(struct rcu_head *head) | ||
125 | { | ||
126 | debug_object_free(head, &rcuhead_debug_descr); | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * fixup_init is called when: | ||
131 | * - an active object is initialized | ||
132 | */ | ||
133 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
134 | { | ||
135 | struct rcu_head *head = addr; | ||
136 | |||
137 | switch (state) { | ||
138 | case ODEBUG_STATE_ACTIVE: | ||
139 | /* | ||
140 | * Ensure that queued callbacks are all executed. | ||
141 | * If we detect that we are nested in a RCU read-side critical | ||
142 | * section, we should simply fail, otherwise we would deadlock. | ||
143 | */ | ||
144 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
145 | irqs_disabled()) { | ||
146 | WARN_ON(1); | ||
147 | return 0; | ||
148 | } | ||
149 | rcu_barrier(); | ||
150 | rcu_barrier_sched(); | ||
151 | rcu_barrier_bh(); | ||
152 | debug_object_init(head, &rcuhead_debug_descr); | ||
153 | return 1; | ||
154 | default: | ||
155 | return 0; | ||
156 | } | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * fixup_activate is called when: | ||
161 | * - an active object is activated | ||
162 | * - an unknown object is activated (might be a statically initialized object) | ||
163 | * Activation is performed internally by call_rcu(). | ||
164 | */ | ||
165 | static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | ||
166 | { | ||
167 | struct rcu_head *head = addr; | ||
168 | |||
169 | switch (state) { | ||
170 | |||
171 | case ODEBUG_STATE_NOTAVAILABLE: | ||
172 | /* | ||
173 | * This is not really a fixup. We just make sure that it is | ||
174 | * tracked in the object tracker. | ||
175 | */ | ||
176 | debug_object_init(head, &rcuhead_debug_descr); | ||
177 | debug_object_activate(head, &rcuhead_debug_descr); | ||
178 | return 0; | ||
179 | |||
180 | case ODEBUG_STATE_ACTIVE: | ||
181 | /* | ||
182 | * Ensure that queued callbacks are all executed. | ||
183 | * If we detect that we are nested in a RCU read-side critical | ||
184 | * section, we should simply fail, otherwise we would deadlock. | ||
185 | */ | ||
186 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
187 | irqs_disabled()) { | ||
188 | WARN_ON(1); | ||
189 | return 0; | ||
190 | } | ||
191 | rcu_barrier(); | ||
192 | rcu_barrier_sched(); | ||
193 | rcu_barrier_bh(); | ||
194 | debug_object_activate(head, &rcuhead_debug_descr); | ||
195 | return 1; | ||
196 | default: | ||
197 | return 0; | ||
198 | } | ||
199 | } | ||
200 | |||
201 | /* | ||
202 | * fixup_free is called when: | ||
203 | * - an active object is freed | ||
204 | */ | ||
205 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
206 | { | ||
207 | struct rcu_head *head = addr; | ||
208 | |||
209 | switch (state) { | ||
210 | case ODEBUG_STATE_ACTIVE: | ||
211 | /* | ||
212 | * Ensure that queued callbacks are all executed. | ||
213 | * If we detect that we are nested in a RCU read-side critical | ||
214 | * section, we should simply fail, otherwise we would deadlock. | ||
215 | */ | ||
216 | #ifndef CONFIG_PREEMPT | ||
217 | WARN_ON(1); | ||
218 | return 0; | ||
219 | #else | ||
220 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
221 | irqs_disabled()) { | ||
222 | WARN_ON(1); | ||
223 | return 0; | ||
224 | } | ||
225 | rcu_barrier(); | ||
226 | rcu_barrier_sched(); | ||
227 | rcu_barrier_bh(); | ||
228 | debug_object_free(head, &rcuhead_debug_descr); | ||
229 | return 1; | ||
230 | #endif | ||
231 | default: | ||
232 | return 0; | ||
233 | } | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects | ||
238 | * @head: pointer to rcu_head structure to be initialized | ||
239 | * | ||
240 | * This function informs debugobjects of a new rcu_head structure that | ||
241 | * has been allocated as an auto variable on the stack. This function | ||
242 | * is not required for rcu_head structures that are statically defined or | ||
243 | * that are dynamically allocated on the heap. This function has no | ||
244 | * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
245 | */ | ||
246 | void init_rcu_head_on_stack(struct rcu_head *head) | ||
247 | { | ||
248 | debug_object_init_on_stack(head, &rcuhead_debug_descr); | ||
249 | } | ||
250 | EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); | ||
251 | |||
252 | /** | ||
253 | * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects | ||
254 | * @head: pointer to rcu_head structure to be initialized | ||
255 | * | ||
256 | * This function informs debugobjects that an on-stack rcu_head structure | ||
257 | * is about to go out of scope. As with init_rcu_head_on_stack(), this | ||
258 | * function is not required for rcu_head structures that are statically | ||
259 | * defined or that are dynamically allocated on the heap. Also as with | ||
260 | * init_rcu_head_on_stack(), this function has no effect for | ||
261 | * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
262 | */ | ||
263 | void destroy_rcu_head_on_stack(struct rcu_head *head) | ||
264 | { | ||
265 | debug_object_free(head, &rcuhead_debug_descr); | ||
266 | } | ||
267 | EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | ||
268 | |||
269 | struct debug_obj_descr rcuhead_debug_descr = { | ||
270 | .name = "rcu_head", | ||
271 | .fixup_init = rcuhead_fixup_init, | ||
272 | .fixup_activate = rcuhead_fixup_activate, | ||
273 | .fixup_free = rcuhead_fixup_free, | ||
274 | }; | ||
275 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | ||
276 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd236..196ec02f8be0 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
169 | while (list) { | 169 | while (list) { |
170 | next = list->next; | 170 | next = list->next; |
171 | prefetch(next); | 171 | prefetch(next); |
172 | debug_rcu_head_unqueue(list); | ||
172 | list->func(list); | 173 | list->func(list); |
173 | list = next; | 174 | list = next; |
174 | } | 175 | } |
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, | |||
211 | { | 212 | { |
212 | unsigned long flags; | 213 | unsigned long flags; |
213 | 214 | ||
215 | debug_rcu_head_queue(head); | ||
214 | head->func = func; | 216 | head->func = func; |
215 | head->next = NULL; | 217 | head->next = NULL; |
216 | 218 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a5..2e2726d790b9 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -239,8 +239,7 @@ static unsigned long | |||
239 | rcu_random(struct rcu_random_state *rrsp) | 239 | rcu_random(struct rcu_random_state *rrsp) |
240 | { | 240 | { |
241 | if (--rrsp->rrs_count < 0) { | 241 | if (--rrsp->rrs_count < 0) { |
242 | rrsp->rrs_state += | 242 | rrsp->rrs_state += (unsigned long)local_clock(); |
243 | (unsigned long)cpu_clock(raw_smp_processor_id()); | ||
244 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | 243 | rrsp->rrs_count = RCU_RANDOM_REFRESH; |
245 | } | 244 | } |
246 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | 245 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706f..d5bc43976c5a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1112 | while (list) { | 1112 | while (list) { |
1113 | next = list->next; | 1113 | next = list->next; |
1114 | prefetch(next); | 1114 | prefetch(next); |
1115 | debug_rcu_head_unqueue(list); | ||
1115 | list->func(list); | 1116 | list->func(list); |
1116 | list = next; | 1117 | list = next; |
1117 | if (++count >= rdp->blimit) | 1118 | if (++count >= rdp->blimit) |
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1388 | unsigned long flags; | 1389 | unsigned long flags; |
1389 | struct rcu_data *rdp; | 1390 | struct rcu_data *rdp; |
1390 | 1391 | ||
1392 | debug_rcu_head_queue(head); | ||
1391 | head->func = func; | 1393 | head->func = func; |
1392 | head->next = NULL; | 1394 | head->next = NULL; |
1393 | 1395 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index f52a8801b7a2..dc85ceb90832 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -77,6 +77,7 @@ | |||
77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
78 | 78 | ||
79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | ||
80 | 81 | ||
81 | #define CREATE_TRACE_POINTS | 82 | #define CREATE_TRACE_POINTS |
82 | #include <trace/events/sched.h> | 83 | #include <trace/events/sched.h> |
@@ -456,9 +457,10 @@ struct rq { | |||
456 | unsigned long nr_running; | 457 | unsigned long nr_running; |
457 | #define CPU_LOAD_IDX_MAX 5 | 458 | #define CPU_LOAD_IDX_MAX 5 |
458 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 459 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
460 | unsigned long last_load_update_tick; | ||
459 | #ifdef CONFIG_NO_HZ | 461 | #ifdef CONFIG_NO_HZ |
460 | u64 nohz_stamp; | 462 | u64 nohz_stamp; |
461 | unsigned char in_nohz_recently; | 463 | unsigned char nohz_balance_kick; |
462 | #endif | 464 | #endif |
463 | unsigned int skip_clock_update; | 465 | unsigned int skip_clock_update; |
464 | 466 | ||
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu) | |||
1193 | 1195 | ||
1194 | #ifdef CONFIG_NO_HZ | 1196 | #ifdef CONFIG_NO_HZ |
1195 | /* | 1197 | /* |
1198 | * In the semi idle case, use the nearest busy cpu for migrating timers | ||
1199 | * from an idle cpu. This is good for power-savings. | ||
1200 | * | ||
1201 | * We don't do similar optimization for completely idle system, as | ||
1202 | * selecting an idle cpu will add more delays to the timers than intended | ||
1203 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | ||
1204 | */ | ||
1205 | int get_nohz_timer_target(void) | ||
1206 | { | ||
1207 | int cpu = smp_processor_id(); | ||
1208 | int i; | ||
1209 | struct sched_domain *sd; | ||
1210 | |||
1211 | for_each_domain(cpu, sd) { | ||
1212 | for_each_cpu(i, sched_domain_span(sd)) | ||
1213 | if (!idle_cpu(i)) | ||
1214 | return i; | ||
1215 | } | ||
1216 | return cpu; | ||
1217 | } | ||
1218 | /* | ||
1196 | * When add_timer_on() enqueues a timer into the timer wheel of an | 1219 | * When add_timer_on() enqueues a timer into the timer wheel of an |
1197 | * idle CPU then this timer might expire before the next timer event | 1220 | * idle CPU then this timer might expire before the next timer event |
1198 | * which is scheduled to wake up that CPU. In case of a completely | 1221 | * which is scheduled to wake up that CPU. In case of a completely |
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu) | |||
1232 | smp_send_reschedule(cpu); | 1255 | smp_send_reschedule(cpu); |
1233 | } | 1256 | } |
1234 | 1257 | ||
1235 | int nohz_ratelimit(int cpu) | ||
1236 | { | ||
1237 | struct rq *rq = cpu_rq(cpu); | ||
1238 | u64 diff = rq->clock - rq->nohz_stamp; | ||
1239 | |||
1240 | rq->nohz_stamp = rq->clock; | ||
1241 | |||
1242 | return diff < (NSEC_PER_SEC / HZ) >> 1; | ||
1243 | } | ||
1244 | |||
1245 | #endif /* CONFIG_NO_HZ */ | 1258 | #endif /* CONFIG_NO_HZ */ |
1246 | 1259 | ||
1247 | static u64 sched_avg_period(void) | 1260 | static u64 sched_avg_period(void) |
@@ -1281,6 +1294,10 @@ static void resched_task(struct task_struct *p) | |||
1281 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1294 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1282 | { | 1295 | { |
1283 | } | 1296 | } |
1297 | |||
1298 | static void sched_avg_update(struct rq *rq) | ||
1299 | { | ||
1300 | } | ||
1284 | #endif /* CONFIG_SMP */ | 1301 | #endif /* CONFIG_SMP */ |
1285 | 1302 | ||
1286 | #if BITS_PER_LONG == 32 | 1303 | #if BITS_PER_LONG == 32 |
@@ -1652,7 +1669,7 @@ static void update_shares(struct sched_domain *sd) | |||
1652 | if (root_task_group_empty()) | 1669 | if (root_task_group_empty()) |
1653 | return; | 1670 | return; |
1654 | 1671 | ||
1655 | now = cpu_clock(raw_smp_processor_id()); | 1672 | now = local_clock(); |
1656 | elapsed = now - sd->last_update; | 1673 | elapsed = now - sd->last_update; |
1657 | 1674 | ||
1658 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1675 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
@@ -1805,6 +1822,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
1805 | static void calc_load_account_idle(struct rq *this_rq); | 1822 | static void calc_load_account_idle(struct rq *this_rq); |
1806 | static void update_sysctl(void); | 1823 | static void update_sysctl(void); |
1807 | static int get_update_sysctl_factor(void); | 1824 | static int get_update_sysctl_factor(void); |
1825 | static void update_cpu_load(struct rq *this_rq); | ||
1808 | 1826 | ||
1809 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1827 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1810 | { | 1828 | { |
@@ -2267,11 +2285,55 @@ static void update_avg(u64 *avg, u64 sample) | |||
2267 | } | 2285 | } |
2268 | #endif | 2286 | #endif |
2269 | 2287 | ||
2270 | /*** | 2288 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, |
2289 | bool is_sync, bool is_migrate, bool is_local, | ||
2290 | unsigned long en_flags) | ||
2291 | { | ||
2292 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
2293 | if (is_sync) | ||
2294 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2295 | if (is_migrate) | ||
2296 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2297 | if (is_local) | ||
2298 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2299 | else | ||
2300 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2301 | |||
2302 | activate_task(rq, p, en_flags); | ||
2303 | } | ||
2304 | |||
2305 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | ||
2306 | int wake_flags, bool success) | ||
2307 | { | ||
2308 | trace_sched_wakeup(p, success); | ||
2309 | check_preempt_curr(rq, p, wake_flags); | ||
2310 | |||
2311 | p->state = TASK_RUNNING; | ||
2312 | #ifdef CONFIG_SMP | ||
2313 | if (p->sched_class->task_woken) | ||
2314 | p->sched_class->task_woken(rq, p); | ||
2315 | |||
2316 | if (unlikely(rq->idle_stamp)) { | ||
2317 | u64 delta = rq->clock - rq->idle_stamp; | ||
2318 | u64 max = 2*sysctl_sched_migration_cost; | ||
2319 | |||
2320 | if (delta > max) | ||
2321 | rq->avg_idle = max; | ||
2322 | else | ||
2323 | update_avg(&rq->avg_idle, delta); | ||
2324 | rq->idle_stamp = 0; | ||
2325 | } | ||
2326 | #endif | ||
2327 | /* if a worker is waking up, notify workqueue */ | ||
2328 | if ((p->flags & PF_WQ_WORKER) && success) | ||
2329 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2330 | } | ||
2331 | |||
2332 | /** | ||
2271 | * try_to_wake_up - wake up a thread | 2333 | * try_to_wake_up - wake up a thread |
2272 | * @p: the to-be-woken-up thread | 2334 | * @p: the thread to be awakened |
2273 | * @state: the mask of task states that can be woken | 2335 | * @state: the mask of task states that can be woken |
2274 | * @sync: do a synchronous wakeup? | 2336 | * @wake_flags: wake modifier flags (WF_*) |
2275 | * | 2337 | * |
2276 | * Put it on the run-queue if it's not already there. The "current" | 2338 | * Put it on the run-queue if it's not already there. The "current" |
2277 | * thread is always on the run-queue (except when the actual | 2339 | * thread is always on the run-queue (except when the actual |
@@ -2279,7 +2341,8 @@ static void update_avg(u64 *avg, u64 sample) | |||
2279 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 2341 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
2280 | * runnable without the overhead of this. | 2342 | * runnable without the overhead of this. |
2281 | * | 2343 | * |
2282 | * returns failure only if the task is already active. | 2344 | * Returns %true if @p was woken up, %false if it was already running |
2345 | * or @state didn't match @p's state. | ||
2283 | */ | 2346 | */ |
2284 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2347 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
2285 | int wake_flags) | 2348 | int wake_flags) |
@@ -2359,38 +2422,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
2359 | 2422 | ||
2360 | out_activate: | 2423 | out_activate: |
2361 | #endif /* CONFIG_SMP */ | 2424 | #endif /* CONFIG_SMP */ |
2362 | schedstat_inc(p, se.statistics.nr_wakeups); | 2425 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, |
2363 | if (wake_flags & WF_SYNC) | 2426 | cpu == this_cpu, en_flags); |
2364 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
2365 | if (orig_cpu != cpu) | ||
2366 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
2367 | if (cpu == this_cpu) | ||
2368 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2369 | else | ||
2370 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2371 | activate_task(rq, p, en_flags); | ||
2372 | success = 1; | 2427 | success = 1; |
2373 | |||
2374 | out_running: | 2428 | out_running: |
2375 | trace_sched_wakeup(p, success); | 2429 | ttwu_post_activation(p, rq, wake_flags, success); |
2376 | check_preempt_curr(rq, p, wake_flags); | ||
2377 | |||
2378 | p->state = TASK_RUNNING; | ||
2379 | #ifdef CONFIG_SMP | ||
2380 | if (p->sched_class->task_woken) | ||
2381 | p->sched_class->task_woken(rq, p); | ||
2382 | |||
2383 | if (unlikely(rq->idle_stamp)) { | ||
2384 | u64 delta = rq->clock - rq->idle_stamp; | ||
2385 | u64 max = 2*sysctl_sched_migration_cost; | ||
2386 | |||
2387 | if (delta > max) | ||
2388 | rq->avg_idle = max; | ||
2389 | else | ||
2390 | update_avg(&rq->avg_idle, delta); | ||
2391 | rq->idle_stamp = 0; | ||
2392 | } | ||
2393 | #endif | ||
2394 | out: | 2430 | out: |
2395 | task_rq_unlock(rq, &flags); | 2431 | task_rq_unlock(rq, &flags); |
2396 | put_cpu(); | 2432 | put_cpu(); |
@@ -2399,6 +2435,37 @@ out: | |||
2399 | } | 2435 | } |
2400 | 2436 | ||
2401 | /** | 2437 | /** |
2438 | * try_to_wake_up_local - try to wake up a local task with rq lock held | ||
2439 | * @p: the thread to be awakened | ||
2440 | * | ||
2441 | * Put @p on the run-queue if it's not already there. The caller must | ||
2442 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | ||
2443 | * the current task. this_rq() stays locked over invocation. | ||
2444 | */ | ||
2445 | static void try_to_wake_up_local(struct task_struct *p) | ||
2446 | { | ||
2447 | struct rq *rq = task_rq(p); | ||
2448 | bool success = false; | ||
2449 | |||
2450 | BUG_ON(rq != this_rq()); | ||
2451 | BUG_ON(p == current); | ||
2452 | lockdep_assert_held(&rq->lock); | ||
2453 | |||
2454 | if (!(p->state & TASK_NORMAL)) | ||
2455 | return; | ||
2456 | |||
2457 | if (!p->se.on_rq) { | ||
2458 | if (likely(!task_running(rq, p))) { | ||
2459 | schedstat_inc(rq, ttwu_count); | ||
2460 | schedstat_inc(rq, ttwu_local); | ||
2461 | } | ||
2462 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | ||
2463 | success = true; | ||
2464 | } | ||
2465 | ttwu_post_activation(p, rq, 0, success); | ||
2466 | } | ||
2467 | |||
2468 | /** | ||
2402 | * wake_up_process - Wake up a specific process | 2469 | * wake_up_process - Wake up a specific process |
2403 | * @p: The process to be woken up. | 2470 | * @p: The process to be woken up. |
2404 | * | 2471 | * |
@@ -3012,23 +3079,102 @@ static void calc_load_account_active(struct rq *this_rq) | |||
3012 | } | 3079 | } |
3013 | 3080 | ||
3014 | /* | 3081 | /* |
3082 | * The exact cpuload at various idx values, calculated at every tick would be | ||
3083 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
3084 | * | ||
3085 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
3086 | * on nth tick when cpu may be busy, then we have: | ||
3087 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
3088 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
3089 | * | ||
3090 | * decay_load_missed() below does efficient calculation of | ||
3091 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
3092 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
3093 | * | ||
3094 | * The calculation is approximated on a 128 point scale. | ||
3095 | * degrade_zero_ticks is the number of ticks after which load at any | ||
3096 | * particular idx is approximated to be zero. | ||
3097 | * degrade_factor is a precomputed table, a row for each load idx. | ||
3098 | * Each column corresponds to degradation factor for a power of two ticks, | ||
3099 | * based on 128 point scale. | ||
3100 | * Example: | ||
3101 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
3102 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
3103 | * | ||
3104 | * With this power of 2 load factors, we can degrade the load n times | ||
3105 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
3106 | * n mult/shifts needed by the exact degradation. | ||
3107 | */ | ||
3108 | #define DEGRADE_SHIFT 7 | ||
3109 | static const unsigned char | ||
3110 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
3111 | static const unsigned char | ||
3112 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
3113 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
3114 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
3115 | {96, 72, 40, 12, 1, 0, 0}, | ||
3116 | {112, 98, 75, 43, 15, 1, 0}, | ||
3117 | {120, 112, 98, 76, 45, 16, 2} }; | ||
3118 | |||
3119 | /* | ||
3120 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
3121 | * would be when CPU is idle and so we just decay the old load without | ||
3122 | * adding any new load. | ||
3123 | */ | ||
3124 | static unsigned long | ||
3125 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
3126 | { | ||
3127 | int j = 0; | ||
3128 | |||
3129 | if (!missed_updates) | ||
3130 | return load; | ||
3131 | |||
3132 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
3133 | return 0; | ||
3134 | |||
3135 | if (idx == 1) | ||
3136 | return load >> missed_updates; | ||
3137 | |||
3138 | while (missed_updates) { | ||
3139 | if (missed_updates % 2) | ||
3140 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
3141 | |||
3142 | missed_updates >>= 1; | ||
3143 | j++; | ||
3144 | } | ||
3145 | return load; | ||
3146 | } | ||
3147 | |||
3148 | /* | ||
3015 | * Update rq->cpu_load[] statistics. This function is usually called every | 3149 | * Update rq->cpu_load[] statistics. This function is usually called every |
3016 | * scheduler tick (TICK_NSEC). | 3150 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
3151 | * every tick. We fix it up based on jiffies. | ||
3017 | */ | 3152 | */ |
3018 | static void update_cpu_load(struct rq *this_rq) | 3153 | static void update_cpu_load(struct rq *this_rq) |
3019 | { | 3154 | { |
3020 | unsigned long this_load = this_rq->load.weight; | 3155 | unsigned long this_load = this_rq->load.weight; |
3156 | unsigned long curr_jiffies = jiffies; | ||
3157 | unsigned long pending_updates; | ||
3021 | int i, scale; | 3158 | int i, scale; |
3022 | 3159 | ||
3023 | this_rq->nr_load_updates++; | 3160 | this_rq->nr_load_updates++; |
3024 | 3161 | ||
3162 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
3163 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
3164 | return; | ||
3165 | |||
3166 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
3167 | this_rq->last_load_update_tick = curr_jiffies; | ||
3168 | |||
3025 | /* Update our load: */ | 3169 | /* Update our load: */ |
3026 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 3170 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
3171 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
3027 | unsigned long old_load, new_load; | 3172 | unsigned long old_load, new_load; |
3028 | 3173 | ||
3029 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 3174 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
3030 | 3175 | ||
3031 | old_load = this_rq->cpu_load[i]; | 3176 | old_load = this_rq->cpu_load[i]; |
3177 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
3032 | new_load = this_load; | 3178 | new_load = this_load; |
3033 | /* | 3179 | /* |
3034 | * Round up the averaging division if load is increasing. This | 3180 | * Round up the averaging division if load is increasing. This |
@@ -3036,10 +3182,18 @@ static void update_cpu_load(struct rq *this_rq) | |||
3036 | * example. | 3182 | * example. |
3037 | */ | 3183 | */ |
3038 | if (new_load > old_load) | 3184 | if (new_load > old_load) |
3039 | new_load += scale-1; | 3185 | new_load += scale - 1; |
3040 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 3186 | |
3187 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
3041 | } | 3188 | } |
3042 | 3189 | ||
3190 | sched_avg_update(this_rq); | ||
3191 | } | ||
3192 | |||
3193 | static void update_cpu_load_active(struct rq *this_rq) | ||
3194 | { | ||
3195 | update_cpu_load(this_rq); | ||
3196 | |||
3043 | calc_load_account_active(this_rq); | 3197 | calc_load_account_active(this_rq); |
3044 | } | 3198 | } |
3045 | 3199 | ||
@@ -3359,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3359 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3513 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
3360 | 3514 | ||
3361 | if (total) { | 3515 | if (total) { |
3362 | u64 temp; | 3516 | u64 temp = rtime; |
3363 | 3517 | ||
3364 | temp = (u64)(rtime * utime); | 3518 | temp *= utime; |
3365 | do_div(temp, total); | 3519 | do_div(temp, total); |
3366 | utime = (cputime_t)temp; | 3520 | utime = (cputime_t)temp; |
3367 | } else | 3521 | } else |
@@ -3392,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3392 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 3546 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
3393 | 3547 | ||
3394 | if (total) { | 3548 | if (total) { |
3395 | u64 temp; | 3549 | u64 temp = rtime; |
3396 | 3550 | ||
3397 | temp = (u64)(rtime * cputime.utime); | 3551 | temp *= cputime.utime; |
3398 | do_div(temp, total); | 3552 | do_div(temp, total); |
3399 | utime = (cputime_t)temp; | 3553 | utime = (cputime_t)temp; |
3400 | } else | 3554 | } else |
@@ -3426,7 +3580,7 @@ void scheduler_tick(void) | |||
3426 | 3580 | ||
3427 | raw_spin_lock(&rq->lock); | 3581 | raw_spin_lock(&rq->lock); |
3428 | update_rq_clock(rq); | 3582 | update_rq_clock(rq); |
3429 | update_cpu_load(rq); | 3583 | update_cpu_load_active(rq); |
3430 | curr->sched_class->task_tick(rq, curr, 0); | 3584 | curr->sched_class->task_tick(rq, curr, 0); |
3431 | raw_spin_unlock(&rq->lock); | 3585 | raw_spin_unlock(&rq->lock); |
3432 | 3586 | ||
@@ -3598,7 +3752,6 @@ need_resched: | |||
3598 | rq = cpu_rq(cpu); | 3752 | rq = cpu_rq(cpu); |
3599 | rcu_note_context_switch(cpu); | 3753 | rcu_note_context_switch(cpu); |
3600 | prev = rq->curr; | 3754 | prev = rq->curr; |
3601 | switch_count = &prev->nivcsw; | ||
3602 | 3755 | ||
3603 | release_kernel_lock(prev); | 3756 | release_kernel_lock(prev); |
3604 | need_resched_nonpreemptible: | 3757 | need_resched_nonpreemptible: |
@@ -3611,11 +3764,26 @@ need_resched_nonpreemptible: | |||
3611 | raw_spin_lock_irq(&rq->lock); | 3764 | raw_spin_lock_irq(&rq->lock); |
3612 | clear_tsk_need_resched(prev); | 3765 | clear_tsk_need_resched(prev); |
3613 | 3766 | ||
3767 | switch_count = &prev->nivcsw; | ||
3614 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3768 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3615 | if (unlikely(signal_pending_state(prev->state, prev))) | 3769 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3616 | prev->state = TASK_RUNNING; | 3770 | prev->state = TASK_RUNNING; |
3617 | else | 3771 | } else { |
3772 | /* | ||
3773 | * If a worker is going to sleep, notify and | ||
3774 | * ask workqueue whether it wants to wake up a | ||
3775 | * task to maintain concurrency. If so, wake | ||
3776 | * up the task. | ||
3777 | */ | ||
3778 | if (prev->flags & PF_WQ_WORKER) { | ||
3779 | struct task_struct *to_wakeup; | ||
3780 | |||
3781 | to_wakeup = wq_worker_sleeping(prev, cpu); | ||
3782 | if (to_wakeup) | ||
3783 | try_to_wake_up_local(to_wakeup); | ||
3784 | } | ||
3618 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3785 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
3786 | } | ||
3619 | switch_count = &prev->nvcsw; | 3787 | switch_count = &prev->nvcsw; |
3620 | } | 3788 | } |
3621 | 3789 | ||
@@ -3637,8 +3805,10 @@ need_resched_nonpreemptible: | |||
3637 | 3805 | ||
3638 | context_switch(rq, prev, next); /* unlocks the rq */ | 3806 | context_switch(rq, prev, next); /* unlocks the rq */ |
3639 | /* | 3807 | /* |
3640 | * the context switch might have flipped the stack from under | 3808 | * The context switch has flipped the stack from under us |
3641 | * us, hence refresh the local variables. | 3809 | * and restored the local variables which were saved when |
3810 | * this task called schedule() in the past. prev == current | ||
3811 | * is still correct, but it can be moved to another cpu/rq. | ||
3642 | */ | 3812 | */ |
3643 | cpu = smp_processor_id(); | 3813 | cpu = smp_processor_id(); |
3644 | rq = cpu_rq(cpu); | 3814 | rq = cpu_rq(cpu); |
@@ -3647,11 +3817,8 @@ need_resched_nonpreemptible: | |||
3647 | 3817 | ||
3648 | post_schedule(rq); | 3818 | post_schedule(rq); |
3649 | 3819 | ||
3650 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3820 | if (unlikely(reacquire_kernel_lock(prev))) |
3651 | prev = rq->curr; | ||
3652 | switch_count = &prev->nivcsw; | ||
3653 | goto need_resched_nonpreemptible; | 3821 | goto need_resched_nonpreemptible; |
3654 | } | ||
3655 | 3822 | ||
3656 | preempt_enable_no_resched(); | 3823 | preempt_enable_no_resched(); |
3657 | if (need_resched()) | 3824 | if (need_resched()) |
@@ -3704,8 +3871,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
3704 | /* | 3871 | /* |
3705 | * Owner changed, break to re-assess state. | 3872 | * Owner changed, break to re-assess state. |
3706 | */ | 3873 | */ |
3707 | if (lock->owner != owner) | 3874 | if (lock->owner != owner) { |
3875 | /* | ||
3876 | * If the lock has switched to a different owner, | ||
3877 | * we likely have heavy contention. Return 0 to quit | ||
3878 | * optimistic spinning and not contend further: | ||
3879 | */ | ||
3880 | if (lock->owner) | ||
3881 | return 0; | ||
3708 | break; | 3882 | break; |
3883 | } | ||
3709 | 3884 | ||
3710 | /* | 3885 | /* |
3711 | * Is that owner really running on that cpu? | 3886 | * Is that owner really running on that cpu? |
@@ -3726,7 +3901,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
3726 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3901 | * off of preempt_enable. Kernel preemptions off return from interrupt |
3727 | * occur there and call schedule directly. | 3902 | * occur there and call schedule directly. |
3728 | */ | 3903 | */ |
3729 | asmlinkage void __sched preempt_schedule(void) | 3904 | asmlinkage void __sched notrace preempt_schedule(void) |
3730 | { | 3905 | { |
3731 | struct thread_info *ti = current_thread_info(); | 3906 | struct thread_info *ti = current_thread_info(); |
3732 | 3907 | ||
@@ -3738,9 +3913,9 @@ asmlinkage void __sched preempt_schedule(void) | |||
3738 | return; | 3913 | return; |
3739 | 3914 | ||
3740 | do { | 3915 | do { |
3741 | add_preempt_count(PREEMPT_ACTIVE); | 3916 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
3742 | schedule(); | 3917 | schedule(); |
3743 | sub_preempt_count(PREEMPT_ACTIVE); | 3918 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
3744 | 3919 | ||
3745 | /* | 3920 | /* |
3746 | * Check again in case we missed a preemption opportunity | 3921 | * Check again in case we missed a preemption opportunity |
@@ -4441,12 +4616,8 @@ recheck: | |||
4441 | */ | 4616 | */ |
4442 | if (user && !capable(CAP_SYS_NICE)) { | 4617 | if (user && !capable(CAP_SYS_NICE)) { |
4443 | if (rt_policy(policy)) { | 4618 | if (rt_policy(policy)) { |
4444 | unsigned long rlim_rtprio; | 4619 | unsigned long rlim_rtprio = |
4445 | 4620 | task_rlimit(p, RLIMIT_RTPRIO); | |
4446 | if (!lock_task_sighand(p, &flags)) | ||
4447 | return -ESRCH; | ||
4448 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); | ||
4449 | unlock_task_sighand(p, &flags); | ||
4450 | 4621 | ||
4451 | /* can't set/change the rt policy */ | 4622 | /* can't set/change the rt policy */ |
4452 | if (policy != p->policy && !rlim_rtprio) | 4623 | if (policy != p->policy && !rlim_rtprio) |
@@ -5816,20 +5987,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5816 | */ | 5987 | */ |
5817 | static struct notifier_block __cpuinitdata migration_notifier = { | 5988 | static struct notifier_block __cpuinitdata migration_notifier = { |
5818 | .notifier_call = migration_call, | 5989 | .notifier_call = migration_call, |
5819 | .priority = 10 | 5990 | .priority = CPU_PRI_MIGRATION, |
5820 | }; | 5991 | }; |
5821 | 5992 | ||
5993 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | ||
5994 | unsigned long action, void *hcpu) | ||
5995 | { | ||
5996 | switch (action & ~CPU_TASKS_FROZEN) { | ||
5997 | case CPU_ONLINE: | ||
5998 | case CPU_DOWN_FAILED: | ||
5999 | set_cpu_active((long)hcpu, true); | ||
6000 | return NOTIFY_OK; | ||
6001 | default: | ||
6002 | return NOTIFY_DONE; | ||
6003 | } | ||
6004 | } | ||
6005 | |||
6006 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | ||
6007 | unsigned long action, void *hcpu) | ||
6008 | { | ||
6009 | switch (action & ~CPU_TASKS_FROZEN) { | ||
6010 | case CPU_DOWN_PREPARE: | ||
6011 | set_cpu_active((long)hcpu, false); | ||
6012 | return NOTIFY_OK; | ||
6013 | default: | ||
6014 | return NOTIFY_DONE; | ||
6015 | } | ||
6016 | } | ||
6017 | |||
5822 | static int __init migration_init(void) | 6018 | static int __init migration_init(void) |
5823 | { | 6019 | { |
5824 | void *cpu = (void *)(long)smp_processor_id(); | 6020 | void *cpu = (void *)(long)smp_processor_id(); |
5825 | int err; | 6021 | int err; |
5826 | 6022 | ||
5827 | /* Start one for the boot CPU: */ | 6023 | /* Initialize migration for the boot CPU */ |
5828 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 6024 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
5829 | BUG_ON(err == NOTIFY_BAD); | 6025 | BUG_ON(err == NOTIFY_BAD); |
5830 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 6026 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
5831 | register_cpu_notifier(&migration_notifier); | 6027 | register_cpu_notifier(&migration_notifier); |
5832 | 6028 | ||
6029 | /* Register cpu active notifiers */ | ||
6030 | cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); | ||
6031 | cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); | ||
6032 | |||
5833 | return 0; | 6033 | return 0; |
5834 | } | 6034 | } |
5835 | early_initcall(migration_init); | 6035 | early_initcall(migration_init); |
@@ -6064,23 +6264,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6064 | free_rootdomain(old_rd); | 6264 | free_rootdomain(old_rd); |
6065 | } | 6265 | } |
6066 | 6266 | ||
6067 | static int init_rootdomain(struct root_domain *rd, bool bootmem) | 6267 | static int init_rootdomain(struct root_domain *rd) |
6068 | { | 6268 | { |
6069 | gfp_t gfp = GFP_KERNEL; | ||
6070 | |||
6071 | memset(rd, 0, sizeof(*rd)); | 6269 | memset(rd, 0, sizeof(*rd)); |
6072 | 6270 | ||
6073 | if (bootmem) | 6271 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) |
6074 | gfp = GFP_NOWAIT; | ||
6075 | |||
6076 | if (!alloc_cpumask_var(&rd->span, gfp)) | ||
6077 | goto out; | 6272 | goto out; |
6078 | if (!alloc_cpumask_var(&rd->online, gfp)) | 6273 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
6079 | goto free_span; | 6274 | goto free_span; |
6080 | if (!alloc_cpumask_var(&rd->rto_mask, gfp)) | 6275 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
6081 | goto free_online; | 6276 | goto free_online; |
6082 | 6277 | ||
6083 | if (cpupri_init(&rd->cpupri, bootmem) != 0) | 6278 | if (cpupri_init(&rd->cpupri) != 0) |
6084 | goto free_rto_mask; | 6279 | goto free_rto_mask; |
6085 | return 0; | 6280 | return 0; |
6086 | 6281 | ||
@@ -6096,7 +6291,7 @@ out: | |||
6096 | 6291 | ||
6097 | static void init_defrootdomain(void) | 6292 | static void init_defrootdomain(void) |
6098 | { | 6293 | { |
6099 | init_rootdomain(&def_root_domain, true); | 6294 | init_rootdomain(&def_root_domain); |
6100 | 6295 | ||
6101 | atomic_set(&def_root_domain.refcount, 1); | 6296 | atomic_set(&def_root_domain.refcount, 1); |
6102 | } | 6297 | } |
@@ -6109,7 +6304,7 @@ static struct root_domain *alloc_rootdomain(void) | |||
6109 | if (!rd) | 6304 | if (!rd) |
6110 | return NULL; | 6305 | return NULL; |
6111 | 6306 | ||
6112 | if (init_rootdomain(rd, false) != 0) { | 6307 | if (init_rootdomain(rd) != 0) { |
6113 | kfree(rd); | 6308 | kfree(rd); |
6114 | return NULL; | 6309 | return NULL; |
6115 | } | 6310 | } |
@@ -7288,29 +7483,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7288 | } | 7483 | } |
7289 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 7484 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7290 | 7485 | ||
7291 | #ifndef CONFIG_CPUSETS | ||
7292 | /* | 7486 | /* |
7293 | * Add online and remove offline CPUs from the scheduler domains. | 7487 | * Update cpusets according to cpu_active mask. If cpusets are |
7294 | * When cpusets are enabled they take over this function. | 7488 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
7489 | * around partition_sched_domains(). | ||
7295 | */ | 7490 | */ |
7296 | static int update_sched_domains(struct notifier_block *nfb, | 7491 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
7297 | unsigned long action, void *hcpu) | 7492 | void *hcpu) |
7298 | { | 7493 | { |
7299 | switch (action) { | 7494 | switch (action & ~CPU_TASKS_FROZEN) { |
7300 | case CPU_ONLINE: | 7495 | case CPU_ONLINE: |
7301 | case CPU_ONLINE_FROZEN: | ||
7302 | case CPU_DOWN_PREPARE: | ||
7303 | case CPU_DOWN_PREPARE_FROZEN: | ||
7304 | case CPU_DOWN_FAILED: | 7496 | case CPU_DOWN_FAILED: |
7305 | case CPU_DOWN_FAILED_FROZEN: | 7497 | cpuset_update_active_cpus(); |
7306 | partition_sched_domains(1, NULL, NULL); | ||
7307 | return NOTIFY_OK; | 7498 | return NOTIFY_OK; |
7499 | default: | ||
7500 | return NOTIFY_DONE; | ||
7501 | } | ||
7502 | } | ||
7308 | 7503 | ||
7504 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | ||
7505 | void *hcpu) | ||
7506 | { | ||
7507 | switch (action & ~CPU_TASKS_FROZEN) { | ||
7508 | case CPU_DOWN_PREPARE: | ||
7509 | cpuset_update_active_cpus(); | ||
7510 | return NOTIFY_OK; | ||
7309 | default: | 7511 | default: |
7310 | return NOTIFY_DONE; | 7512 | return NOTIFY_DONE; |
7311 | } | 7513 | } |
7312 | } | 7514 | } |
7313 | #endif | ||
7314 | 7515 | ||
7315 | static int update_runtime(struct notifier_block *nfb, | 7516 | static int update_runtime(struct notifier_block *nfb, |
7316 | unsigned long action, void *hcpu) | 7517 | unsigned long action, void *hcpu) |
@@ -7356,10 +7557,8 @@ void __init sched_init_smp(void) | |||
7356 | mutex_unlock(&sched_domains_mutex); | 7557 | mutex_unlock(&sched_domains_mutex); |
7357 | put_online_cpus(); | 7558 | put_online_cpus(); |
7358 | 7559 | ||
7359 | #ifndef CONFIG_CPUSETS | 7560 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
7360 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7561 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
7361 | hotcpu_notifier(update_sched_domains, 0); | ||
7362 | #endif | ||
7363 | 7562 | ||
7364 | /* RT runtime code needs to handle some hotplug events */ | 7563 | /* RT runtime code needs to handle some hotplug events */ |
7365 | hotcpu_notifier(update_runtime, 0); | 7564 | hotcpu_notifier(update_runtime, 0); |
@@ -7604,6 +7803,9 @@ void __init sched_init(void) | |||
7604 | 7803 | ||
7605 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7804 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
7606 | rq->cpu_load[j] = 0; | 7805 | rq->cpu_load[j] = 0; |
7806 | |||
7807 | rq->last_load_update_tick = jiffies; | ||
7808 | |||
7607 | #ifdef CONFIG_SMP | 7809 | #ifdef CONFIG_SMP |
7608 | rq->sd = NULL; | 7810 | rq->sd = NULL; |
7609 | rq->rd = NULL; | 7811 | rq->rd = NULL; |
@@ -7617,6 +7819,10 @@ void __init sched_init(void) | |||
7617 | rq->idle_stamp = 0; | 7819 | rq->idle_stamp = 0; |
7618 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 7820 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
7619 | rq_attach_root(rq, &def_root_domain); | 7821 | rq_attach_root(rq, &def_root_domain); |
7822 | #ifdef CONFIG_NO_HZ | ||
7823 | rq->nohz_balance_kick = 0; | ||
7824 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
7825 | #endif | ||
7620 | #endif | 7826 | #endif |
7621 | init_rq_hrtick(rq); | 7827 | init_rq_hrtick(rq); |
7622 | atomic_set(&rq->nr_iowait, 0); | 7828 | atomic_set(&rq->nr_iowait, 0); |
@@ -7661,8 +7867,11 @@ void __init sched_init(void) | |||
7661 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 7867 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
7662 | #ifdef CONFIG_SMP | 7868 | #ifdef CONFIG_SMP |
7663 | #ifdef CONFIG_NO_HZ | 7869 | #ifdef CONFIG_NO_HZ |
7664 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 7870 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
7665 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 7871 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
7872 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
7873 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
7874 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
7666 | #endif | 7875 | #endif |
7667 | /* May be allocated at isolcpus cmdline parse time */ | 7876 | /* May be allocated at isolcpus cmdline parse time */ |
7668 | if (cpu_isolated_map == NULL) | 7877 | if (cpu_isolated_map == NULL) |
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb3..52f1a149bfb1 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
@@ -10,19 +10,55 @@ | |||
10 | * Ingo Molnar <mingo@redhat.com> | 10 | * Ingo Molnar <mingo@redhat.com> |
11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
12 | * | 12 | * |
13 | * Create a semi stable clock from a mixture of other events, including: | 13 | * |
14 | * - gtod | 14 | * What: |
15 | * | ||
16 | * cpu_clock(i) provides a fast (execution time) high resolution | ||
17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | ||
18 | * is monotonic for constant i. The timestamp returned is in nanoseconds. | ||
19 | * | ||
20 | * ######################### BIG FAT WARNING ########################## | ||
21 | * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # | ||
22 | * # go backwards !! # | ||
23 | * #################################################################### | ||
24 | * | ||
25 | * There is no strict promise about the base, although it tends to start | ||
26 | * at 0 on boot (but people really shouldn't rely on that). | ||
27 | * | ||
28 | * cpu_clock(i) -- can be used from any context, including NMI. | ||
29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
30 | * local_clock() -- is cpu_clock() on the current cpu. | ||
31 | * | ||
32 | * How: | ||
33 | * | ||
34 | * The implementation either uses sched_clock() when | ||
35 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | ||
36 | * sched_clock() is assumed to provide these properties (mostly it means | ||
37 | * the architecture provides a globally synchronized highres time source). | ||
38 | * | ||
39 | * Otherwise it tries to create a semi stable clock from a mixture of other | ||
40 | * clocks, including: | ||
41 | * | ||
42 | * - GTOD (clock monotonic) | ||
15 | * - sched_clock() | 43 | * - sched_clock() |
16 | * - explicit idle events | 44 | * - explicit idle events |
17 | * | 45 | * |
18 | * We use gtod as base and the unstable clock deltas. The deltas are filtered, | 46 | * We use GTOD as base and use sched_clock() deltas to improve resolution. The |
19 | * making it monotonic and keeping it within an expected window. | 47 | * deltas are filtered to provide monotonicity and keeping it within an |
48 | * expected window. | ||
20 | * | 49 | * |
21 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
22 | * that is otherwise invisible (TSC gets stopped). | 51 | * that is otherwise invisible (TSC gets stopped). |
23 | * | 52 | * |
24 | * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat | 53 | * |
25 | * consistent between cpus (never more than 2 jiffies difference). | 54 | * Notes: |
55 | * | ||
56 | * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things | ||
57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
58 | * and cause funny jumps in time -- although the filtering provided by | ||
59 | * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it | ||
60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
61 | * sched_clock(). | ||
26 | */ | 62 | */ |
27 | #include <linux/spinlock.h> | 63 | #include <linux/spinlock.h> |
28 | #include <linux/hardirq.h> | 64 | #include <linux/hardirq.h> |
@@ -170,6 +206,11 @@ again: | |||
170 | return val; | 206 | return val; |
171 | } | 207 | } |
172 | 208 | ||
209 | /* | ||
210 | * Similar to cpu_clock(), but requires local IRQs to be disabled. | ||
211 | * | ||
212 | * See cpu_clock(). | ||
213 | */ | ||
173 | u64 sched_clock_cpu(int cpu) | 214 | u64 sched_clock_cpu(int cpu) |
174 | { | 215 | { |
175 | struct sched_clock_data *scd; | 216 | struct sched_clock_data *scd; |
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
237 | } | 278 | } |
238 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 279 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
239 | 280 | ||
240 | unsigned long long cpu_clock(int cpu) | 281 | /* |
282 | * As outlined at the top, provides a fast, high resolution, nanosecond | ||
283 | * time source that is monotonic per cpu argument and has bounded drift | ||
284 | * between cpus. | ||
285 | * | ||
286 | * ######################### BIG FAT WARNING ########################## | ||
287 | * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # | ||
288 | * # go backwards !! # | ||
289 | * #################################################################### | ||
290 | */ | ||
291 | u64 cpu_clock(int cpu) | ||
241 | { | 292 | { |
242 | unsigned long long clock; | 293 | u64 clock; |
243 | unsigned long flags; | 294 | unsigned long flags; |
244 | 295 | ||
245 | local_irq_save(flags); | 296 | local_irq_save(flags); |
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) | |||
249 | return clock; | 300 | return clock; |
250 | } | 301 | } |
251 | 302 | ||
303 | /* | ||
304 | * Similar to cpu_clock() for the current cpu. Time will only be observed | ||
305 | * to be monotonic if care is taken to only compare timestamps taken on the | ||
306 | * same CPU. | ||
307 | * | ||
308 | * See cpu_clock(). | ||
309 | */ | ||
310 | u64 local_clock(void) | ||
311 | { | ||
312 | u64 clock; | ||
313 | unsigned long flags; | ||
314 | |||
315 | local_irq_save(flags); | ||
316 | clock = sched_clock_cpu(smp_processor_id()); | ||
317 | local_irq_restore(flags); | ||
318 | |||
319 | return clock; | ||
320 | } | ||
321 | |||
252 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 322 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
253 | 323 | ||
254 | void sched_clock_init(void) | 324 | void sched_clock_init(void) |
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) | |||
264 | return sched_clock(); | 334 | return sched_clock(); |
265 | } | 335 | } |
266 | 336 | ||
267 | 337 | u64 cpu_clock(int cpu) | |
268 | unsigned long long cpu_clock(int cpu) | ||
269 | { | 338 | { |
270 | return sched_clock_cpu(cpu); | 339 | return sched_clock_cpu(cpu); |
271 | } | 340 | } |
272 | 341 | ||
342 | u64 local_clock(void) | ||
343 | { | ||
344 | return sched_clock_cpu(0); | ||
345 | } | ||
346 | |||
273 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 347 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
274 | 348 | ||
275 | EXPORT_SYMBOL_GPL(cpu_clock); | 349 | EXPORT_SYMBOL_GPL(cpu_clock); |
350 | EXPORT_SYMBOL_GPL(local_clock); | ||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc83..2722dc1b4138 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
166 | * | 166 | * |
167 | * Returns: -ENOMEM if memory fails. | 167 | * Returns: -ENOMEM if memory fails. |
168 | */ | 168 | */ |
169 | int cpupri_init(struct cpupri *cp, bool bootmem) | 169 | int cpupri_init(struct cpupri *cp) |
170 | { | 170 | { |
171 | gfp_t gfp = GFP_KERNEL; | ||
172 | int i; | 171 | int i; |
173 | 172 | ||
174 | if (bootmem) | ||
175 | gfp = GFP_NOWAIT; | ||
176 | |||
177 | memset(cp, 0, sizeof(*cp)); | 173 | memset(cp, 0, sizeof(*cp)); |
178 | 174 | ||
179 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) | |||
181 | 177 | ||
182 | raw_spin_lock_init(&vec->lock); | 178 | raw_spin_lock_init(&vec->lock); |
183 | vec->count = 0; | 179 | vec->count = 0; |
184 | if (!zalloc_cpumask_var(&vec->mask, gfp)) | 180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
185 | goto cleanup; | 181 | goto cleanup; |
186 | } | 182 | } |
187 | 183 | ||
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95be..9fc7d386fea4 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
@@ -27,7 +27,7 @@ struct cpupri { | |||
27 | int cpupri_find(struct cpupri *cp, | 27 | int cpupri_find(struct cpupri *cp, |
28 | struct task_struct *p, struct cpumask *lowest_mask); | 28 | struct task_struct *p, struct cpumask *lowest_mask); |
29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
30 | int cpupri_init(struct cpupri *cp, bool bootmem); | 30 | int cpupri_init(struct cpupri *cp); |
31 | void cpupri_cleanup(struct cpupri *cp); | 31 | void cpupri_cleanup(struct cpupri *cp); |
32 | #else | 32 | #else |
33 | #define cpupri_set(cp, cpu, pri) do { } while (0) | 33 | #define cpupri_set(cp, cpu, pri) do { } while (0) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00d..2e1b0d17dd9b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
332 | PN(sysctl_sched_latency); | 332 | PN(sysctl_sched_latency); |
333 | PN(sysctl_sched_min_granularity); | 333 | PN(sysctl_sched_min_granularity); |
334 | PN(sysctl_sched_wakeup_granularity); | 334 | PN(sysctl_sched_wakeup_granularity); |
335 | PN(sysctl_sched_child_runs_first); | 335 | P(sysctl_sched_child_runs_first); |
336 | P(sysctl_sched_features); | 336 | P(sysctl_sched_features); |
337 | #undef PN | 337 | #undef PN |
338 | #undef P | 338 | #undef P |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332daa..db3f674ca49d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling | |||
54 | * Minimal preemption granularity for CPU-bound tasks: | 54 | * Minimal preemption granularity for CPU-bound tasks: |
55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) | 55 | * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) |
56 | */ | 56 | */ |
57 | unsigned int sysctl_sched_min_granularity = 2000000ULL; | 57 | unsigned int sysctl_sched_min_granularity = 750000ULL; |
58 | unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; | 58 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 61 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
62 | */ | 62 | */ |
63 | static unsigned int sched_nr_latency = 3; | 63 | static unsigned int sched_nr_latency = 8; |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * After fork, child runs first. If set to 0 (default) then | 66 | * After fork, child runs first. If set to 0 (default) then |
@@ -1313,7 +1313,7 @@ static struct sched_group * | |||
1313 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | 1313 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
1314 | int this_cpu, int load_idx) | 1314 | int this_cpu, int load_idx) |
1315 | { | 1315 | { |
1316 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 1316 | struct sched_group *idlest = NULL, *group = sd->groups; |
1317 | unsigned long min_load = ULONG_MAX, this_load = 0; | 1317 | unsigned long min_load = ULONG_MAX, this_load = 0; |
1318 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 1318 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
1319 | 1319 | ||
@@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1348 | 1348 | ||
1349 | if (local_group) { | 1349 | if (local_group) { |
1350 | this_load = avg_load; | 1350 | this_load = avg_load; |
1351 | this = group; | ||
1352 | } else if (avg_load < min_load) { | 1351 | } else if (avg_load < min_load) { |
1353 | min_load = avg_load; | 1352 | min_load = avg_load; |
1354 | idlest = group; | 1353 | idlest = group; |
@@ -2268,8 +2267,6 @@ unsigned long scale_rt_power(int cpu) | |||
2268 | struct rq *rq = cpu_rq(cpu); | 2267 | struct rq *rq = cpu_rq(cpu); |
2269 | u64 total, available; | 2268 | u64 total, available; |
2270 | 2269 | ||
2271 | sched_avg_update(rq); | ||
2272 | |||
2273 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 2270 | total = sched_avg_period() + (rq->clock - rq->age_stamp); |
2274 | available = total - rq->rt_avg; | 2271 | available = total - rq->rt_avg; |
2275 | 2272 | ||
@@ -2287,13 +2284,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2287 | unsigned long power = SCHED_LOAD_SCALE; | 2284 | unsigned long power = SCHED_LOAD_SCALE; |
2288 | struct sched_group *sdg = sd->groups; | 2285 | struct sched_group *sdg = sd->groups; |
2289 | 2286 | ||
2290 | if (sched_feat(ARCH_POWER)) | ||
2291 | power *= arch_scale_freq_power(sd, cpu); | ||
2292 | else | ||
2293 | power *= default_scale_freq_power(sd, cpu); | ||
2294 | |||
2295 | power >>= SCHED_LOAD_SHIFT; | ||
2296 | |||
2297 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2287 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
2298 | if (sched_feat(ARCH_POWER)) | 2288 | if (sched_feat(ARCH_POWER)) |
2299 | power *= arch_scale_smt_power(sd, cpu); | 2289 | power *= arch_scale_smt_power(sd, cpu); |
@@ -2303,6 +2293,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2303 | power >>= SCHED_LOAD_SHIFT; | 2293 | power >>= SCHED_LOAD_SHIFT; |
2304 | } | 2294 | } |
2305 | 2295 | ||
2296 | sdg->cpu_power_orig = power; | ||
2297 | |||
2298 | if (sched_feat(ARCH_POWER)) | ||
2299 | power *= arch_scale_freq_power(sd, cpu); | ||
2300 | else | ||
2301 | power *= default_scale_freq_power(sd, cpu); | ||
2302 | |||
2303 | power >>= SCHED_LOAD_SHIFT; | ||
2304 | |||
2306 | power *= scale_rt_power(cpu); | 2305 | power *= scale_rt_power(cpu); |
2307 | power >>= SCHED_LOAD_SHIFT; | 2306 | power >>= SCHED_LOAD_SHIFT; |
2308 | 2307 | ||
@@ -2335,6 +2334,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
2335 | sdg->cpu_power = power; | 2334 | sdg->cpu_power = power; |
2336 | } | 2335 | } |
2337 | 2336 | ||
2337 | /* | ||
2338 | * Try and fix up capacity for tiny siblings, this is needed when | ||
2339 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | ||
2340 | * which on its own isn't powerful enough. | ||
2341 | * | ||
2342 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
2343 | */ | ||
2344 | static inline int | ||
2345 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | ||
2346 | { | ||
2347 | /* | ||
2348 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | ||
2349 | */ | ||
2350 | if (sd->level != SD_LV_SIBLING) | ||
2351 | return 0; | ||
2352 | |||
2353 | /* | ||
2354 | * If ~90% of the cpu_power is still there, we're good. | ||
2355 | */ | ||
2356 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | ||
2357 | return 1; | ||
2358 | |||
2359 | return 0; | ||
2360 | } | ||
2361 | |||
2338 | /** | 2362 | /** |
2339 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 2363 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
2340 | * @sd: The sched_domain whose statistics are to be updated. | 2364 | * @sd: The sched_domain whose statistics are to be updated. |
@@ -2400,14 +2424,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2400 | * domains. In the newly idle case, we will allow all the cpu's | 2424 | * domains. In the newly idle case, we will allow all the cpu's |
2401 | * to do the newly idle load balance. | 2425 | * to do the newly idle load balance. |
2402 | */ | 2426 | */ |
2403 | if (idle != CPU_NEWLY_IDLE && local_group && | 2427 | if (idle != CPU_NEWLY_IDLE && local_group) { |
2404 | balance_cpu != this_cpu) { | 2428 | if (balance_cpu != this_cpu) { |
2405 | *balance = 0; | 2429 | *balance = 0; |
2406 | return; | 2430 | return; |
2431 | } | ||
2432 | update_group_power(sd, this_cpu); | ||
2407 | } | 2433 | } |
2408 | 2434 | ||
2409 | update_group_power(sd, this_cpu); | ||
2410 | |||
2411 | /* Adjust by relative CPU power of the group */ | 2435 | /* Adjust by relative CPU power of the group */ |
2412 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2436 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2413 | 2437 | ||
@@ -2428,6 +2452,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2428 | 2452 | ||
2429 | sgs->group_capacity = | 2453 | sgs->group_capacity = |
2430 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2454 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
2455 | if (!sgs->group_capacity) | ||
2456 | sgs->group_capacity = fix_small_capacity(sd, group); | ||
2457 | } | ||
2458 | |||
2459 | /** | ||
2460 | * update_sd_pick_busiest - return 1 on busiest group | ||
2461 | * @sd: sched_domain whose statistics are to be checked | ||
2462 | * @sds: sched_domain statistics | ||
2463 | * @sg: sched_group candidate to be checked for being the busiest | ||
2464 | * @sgs: sched_group statistics | ||
2465 | * @this_cpu: the current cpu | ||
2466 | * | ||
2467 | * Determine if @sg is a busier group than the previously selected | ||
2468 | * busiest group. | ||
2469 | */ | ||
2470 | static bool update_sd_pick_busiest(struct sched_domain *sd, | ||
2471 | struct sd_lb_stats *sds, | ||
2472 | struct sched_group *sg, | ||
2473 | struct sg_lb_stats *sgs, | ||
2474 | int this_cpu) | ||
2475 | { | ||
2476 | if (sgs->avg_load <= sds->max_load) | ||
2477 | return false; | ||
2478 | |||
2479 | if (sgs->sum_nr_running > sgs->group_capacity) | ||
2480 | return true; | ||
2481 | |||
2482 | if (sgs->group_imb) | ||
2483 | return true; | ||
2484 | |||
2485 | /* | ||
2486 | * ASYM_PACKING needs to move all the work to the lowest | ||
2487 | * numbered CPUs in the group, therefore mark all groups | ||
2488 | * higher than ourself as busy. | ||
2489 | */ | ||
2490 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | ||
2491 | this_cpu < group_first_cpu(sg)) { | ||
2492 | if (!sds->busiest) | ||
2493 | return true; | ||
2494 | |||
2495 | if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) | ||
2496 | return true; | ||
2497 | } | ||
2498 | |||
2499 | return false; | ||
2431 | } | 2500 | } |
2432 | 2501 | ||
2433 | /** | 2502 | /** |
@@ -2435,7 +2504,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2435 | * @sd: sched_domain whose statistics are to be updated. | 2504 | * @sd: sched_domain whose statistics are to be updated. |
2436 | * @this_cpu: Cpu for which load balance is currently performed. | 2505 | * @this_cpu: Cpu for which load balance is currently performed. |
2437 | * @idle: Idle status of this_cpu | 2506 | * @idle: Idle status of this_cpu |
2438 | * @sd_idle: Idle status of the sched_domain containing group. | 2507 | * @sd_idle: Idle status of the sched_domain containing sg. |
2439 | * @cpus: Set of cpus considered for load balancing. | 2508 | * @cpus: Set of cpus considered for load balancing. |
2440 | * @balance: Should we balance. | 2509 | * @balance: Should we balance. |
2441 | * @sds: variable to hold the statistics for this sched_domain. | 2510 | * @sds: variable to hold the statistics for this sched_domain. |
@@ -2446,7 +2515,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2446 | struct sd_lb_stats *sds) | 2515 | struct sd_lb_stats *sds) |
2447 | { | 2516 | { |
2448 | struct sched_domain *child = sd->child; | 2517 | struct sched_domain *child = sd->child; |
2449 | struct sched_group *group = sd->groups; | 2518 | struct sched_group *sg = sd->groups; |
2450 | struct sg_lb_stats sgs; | 2519 | struct sg_lb_stats sgs; |
2451 | int load_idx, prefer_sibling = 0; | 2520 | int load_idx, prefer_sibling = 0; |
2452 | 2521 | ||
@@ -2459,21 +2528,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2459 | do { | 2528 | do { |
2460 | int local_group; | 2529 | int local_group; |
2461 | 2530 | ||
2462 | local_group = cpumask_test_cpu(this_cpu, | 2531 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2463 | sched_group_cpus(group)); | ||
2464 | memset(&sgs, 0, sizeof(sgs)); | 2532 | memset(&sgs, 0, sizeof(sgs)); |
2465 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | 2533 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, |
2466 | local_group, cpus, balance, &sgs); | 2534 | local_group, cpus, balance, &sgs); |
2467 | 2535 | ||
2468 | if (local_group && !(*balance)) | 2536 | if (local_group && !(*balance)) |
2469 | return; | 2537 | return; |
2470 | 2538 | ||
2471 | sds->total_load += sgs.group_load; | 2539 | sds->total_load += sgs.group_load; |
2472 | sds->total_pwr += group->cpu_power; | 2540 | sds->total_pwr += sg->cpu_power; |
2473 | 2541 | ||
2474 | /* | 2542 | /* |
2475 | * In case the child domain prefers tasks go to siblings | 2543 | * In case the child domain prefers tasks go to siblings |
2476 | * first, lower the group capacity to one so that we'll try | 2544 | * first, lower the sg capacity to one so that we'll try |
2477 | * and move all the excess tasks away. | 2545 | * and move all the excess tasks away. |
2478 | */ | 2546 | */ |
2479 | if (prefer_sibling) | 2547 | if (prefer_sibling) |
@@ -2481,23 +2549,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2481 | 2549 | ||
2482 | if (local_group) { | 2550 | if (local_group) { |
2483 | sds->this_load = sgs.avg_load; | 2551 | sds->this_load = sgs.avg_load; |
2484 | sds->this = group; | 2552 | sds->this = sg; |
2485 | sds->this_nr_running = sgs.sum_nr_running; | 2553 | sds->this_nr_running = sgs.sum_nr_running; |
2486 | sds->this_load_per_task = sgs.sum_weighted_load; | 2554 | sds->this_load_per_task = sgs.sum_weighted_load; |
2487 | } else if (sgs.avg_load > sds->max_load && | 2555 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
2488 | (sgs.sum_nr_running > sgs.group_capacity || | ||
2489 | sgs.group_imb)) { | ||
2490 | sds->max_load = sgs.avg_load; | 2556 | sds->max_load = sgs.avg_load; |
2491 | sds->busiest = group; | 2557 | sds->busiest = sg; |
2492 | sds->busiest_nr_running = sgs.sum_nr_running; | 2558 | sds->busiest_nr_running = sgs.sum_nr_running; |
2493 | sds->busiest_group_capacity = sgs.group_capacity; | 2559 | sds->busiest_group_capacity = sgs.group_capacity; |
2494 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2560 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
2495 | sds->group_imb = sgs.group_imb; | 2561 | sds->group_imb = sgs.group_imb; |
2496 | } | 2562 | } |
2497 | 2563 | ||
2498 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 2564 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); |
2499 | group = group->next; | 2565 | sg = sg->next; |
2500 | } while (group != sd->groups); | 2566 | } while (sg != sd->groups); |
2567 | } | ||
2568 | |||
2569 | int __weak arch_sd_sibling_asym_packing(void) | ||
2570 | { | ||
2571 | return 0*SD_ASYM_PACKING; | ||
2572 | } | ||
2573 | |||
2574 | /** | ||
2575 | * check_asym_packing - Check to see if the group is packed into the | ||
2576 | * sched domain. | ||
2577 | * | ||
2578 | * This is primarily intended to be used at the sibling level. Some | ||
2579 | * cores like POWER7 prefer to use lower numbered SMT threads. In the | ||
2580 | * case of POWER7, it can move to lower SMT modes only when higher | ||
2581 | * threads are idle. When in lower SMT modes, the threads will | ||
2582 | * perform better since they share less core resources. Hence when we | ||
2583 | * have idle threads, we want them to be the higher ones. | ||
2584 | * | ||
2585 | * This packing function is run on idle threads. It checks to see if | ||
2586 | * the busiest CPU in this domain (core in the P7 case) has a higher | ||
2587 | * CPU number than the packing function is being run on. Here we are | ||
2588 | * assuming lower CPU number will be equivalent to a lower SMT thread | ||
2589 | * number. | ||
2590 | * | ||
2591 | * Returns 1 when packing is required and a task should be moved to | ||
2592 | * this CPU. The amount of the imbalance is returned in *imbalance. | ||
2593 | * | ||
2594 | * @sd: The sched_domain whose packing is to be checked. | ||
2595 | * @sds: Statistics of the sched_domain which is to be packed | ||
2596 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
2597 | * @imbalance: returns amount of imbalance due to packing. | ||
2598 | */ | ||
2599 | static int check_asym_packing(struct sched_domain *sd, | ||
2600 | struct sd_lb_stats *sds, | ||
2601 | int this_cpu, unsigned long *imbalance) | ||
2602 | { | ||
2603 | int busiest_cpu; | ||
2604 | |||
2605 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
2606 | return 0; | ||
2607 | |||
2608 | if (!sds->busiest) | ||
2609 | return 0; | ||
2610 | |||
2611 | busiest_cpu = group_first_cpu(sds->busiest); | ||
2612 | if (this_cpu > busiest_cpu) | ||
2613 | return 0; | ||
2614 | |||
2615 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | ||
2616 | SCHED_LOAD_SCALE); | ||
2617 | return 1; | ||
2501 | } | 2618 | } |
2502 | 2619 | ||
2503 | /** | 2620 | /** |
@@ -2692,6 +2809,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2692 | if (!(*balance)) | 2809 | if (!(*balance)) |
2693 | goto ret; | 2810 | goto ret; |
2694 | 2811 | ||
2812 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | ||
2813 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | ||
2814 | return sds.busiest; | ||
2815 | |||
2695 | if (!sds.busiest || sds.busiest_nr_running == 0) | 2816 | if (!sds.busiest || sds.busiest_nr_running == 0) |
2696 | goto out_balanced; | 2817 | goto out_balanced; |
2697 | 2818 | ||
@@ -2726,8 +2847,9 @@ ret: | |||
2726 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2847 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2727 | */ | 2848 | */ |
2728 | static struct rq * | 2849 | static struct rq * |
2729 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 2850 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, |
2730 | unsigned long imbalance, const struct cpumask *cpus) | 2851 | enum cpu_idle_type idle, unsigned long imbalance, |
2852 | const struct cpumask *cpus) | ||
2731 | { | 2853 | { |
2732 | struct rq *busiest = NULL, *rq; | 2854 | struct rq *busiest = NULL, *rq; |
2733 | unsigned long max_load = 0; | 2855 | unsigned long max_load = 0; |
@@ -2738,6 +2860,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2738 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 2860 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); |
2739 | unsigned long wl; | 2861 | unsigned long wl; |
2740 | 2862 | ||
2863 | if (!capacity) | ||
2864 | capacity = fix_small_capacity(sd, group); | ||
2865 | |||
2741 | if (!cpumask_test_cpu(i, cpus)) | 2866 | if (!cpumask_test_cpu(i, cpus)) |
2742 | continue; | 2867 | continue; |
2743 | 2868 | ||
@@ -2777,9 +2902,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
2777 | /* Working cpumask for load_balance and load_balance_newidle. */ | 2902 | /* Working cpumask for load_balance and load_balance_newidle. */ |
2778 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 2903 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
2779 | 2904 | ||
2780 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | 2905 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, |
2906 | int busiest_cpu, int this_cpu) | ||
2781 | { | 2907 | { |
2782 | if (idle == CPU_NEWLY_IDLE) { | 2908 | if (idle == CPU_NEWLY_IDLE) { |
2909 | |||
2910 | /* | ||
2911 | * ASYM_PACKING needs to force migrate tasks from busy but | ||
2912 | * higher numbered CPUs in order to pack all tasks in the | ||
2913 | * lowest numbered CPUs. | ||
2914 | */ | ||
2915 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | ||
2916 | return 1; | ||
2917 | |||
2783 | /* | 2918 | /* |
2784 | * The only task running in a non-idle cpu can be moved to this | 2919 | * The only task running in a non-idle cpu can be moved to this |
2785 | * cpu in an attempt to completely freeup the other CPU | 2920 | * cpu in an attempt to completely freeup the other CPU |
@@ -2854,7 +2989,7 @@ redo: | |||
2854 | goto out_balanced; | 2989 | goto out_balanced; |
2855 | } | 2990 | } |
2856 | 2991 | ||
2857 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | 2992 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); |
2858 | if (!busiest) { | 2993 | if (!busiest) { |
2859 | schedstat_inc(sd, lb_nobusyq[idle]); | 2994 | schedstat_inc(sd, lb_nobusyq[idle]); |
2860 | goto out_balanced; | 2995 | goto out_balanced; |
@@ -2898,7 +3033,8 @@ redo: | |||
2898 | schedstat_inc(sd, lb_failed[idle]); | 3033 | schedstat_inc(sd, lb_failed[idle]); |
2899 | sd->nr_balance_failed++; | 3034 | sd->nr_balance_failed++; |
2900 | 3035 | ||
2901 | if (need_active_balance(sd, sd_idle, idle)) { | 3036 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), |
3037 | this_cpu)) { | ||
2902 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3038 | raw_spin_lock_irqsave(&busiest->lock, flags); |
2903 | 3039 | ||
2904 | /* don't kick the active_load_balance_cpu_stop, | 3040 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3093,13 +3229,40 @@ out_unlock: | |||
3093 | } | 3229 | } |
3094 | 3230 | ||
3095 | #ifdef CONFIG_NO_HZ | 3231 | #ifdef CONFIG_NO_HZ |
3232 | |||
3233 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3234 | |||
3235 | static void trigger_sched_softirq(void *data) | ||
3236 | { | ||
3237 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3238 | } | ||
3239 | |||
3240 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3241 | { | ||
3242 | csd->func = trigger_sched_softirq; | ||
3243 | csd->info = NULL; | ||
3244 | csd->flags = 0; | ||
3245 | csd->priv = 0; | ||
3246 | } | ||
3247 | |||
3248 | /* | ||
3249 | * idle load balancing details | ||
3250 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
3251 | * entering idle. | ||
3252 | * - This idle load balancer CPU will also go into tickless mode when | ||
3253 | * it is idle, just like all other idle CPUs | ||
3254 | * - When one of the busy CPUs notice that there may be an idle rebalancing | ||
3255 | * needed, they will kick the idle load balancer, which then does idle | ||
3256 | * load balancing for all the idle CPUs. | ||
3257 | */ | ||
3096 | static struct { | 3258 | static struct { |
3097 | atomic_t load_balancer; | 3259 | atomic_t load_balancer; |
3098 | cpumask_var_t cpu_mask; | 3260 | atomic_t first_pick_cpu; |
3099 | cpumask_var_t ilb_grp_nohz_mask; | 3261 | atomic_t second_pick_cpu; |
3100 | } nohz ____cacheline_aligned = { | 3262 | cpumask_var_t idle_cpus_mask; |
3101 | .load_balancer = ATOMIC_INIT(-1), | 3263 | cpumask_var_t grp_idle_mask; |
3102 | }; | 3264 | unsigned long next_balance; /* in jiffy units */ |
3265 | } nohz ____cacheline_aligned; | ||
3103 | 3266 | ||
3104 | int get_nohz_load_balancer(void) | 3267 | int get_nohz_load_balancer(void) |
3105 | { | 3268 | { |
@@ -3153,17 +3316,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3153 | */ | 3316 | */ |
3154 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | 3317 | static inline int is_semi_idle_group(struct sched_group *ilb_group) |
3155 | { | 3318 | { |
3156 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | 3319 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, |
3157 | sched_group_cpus(ilb_group)); | 3320 | sched_group_cpus(ilb_group)); |
3158 | 3321 | ||
3159 | /* | 3322 | /* |
3160 | * A sched_group is semi-idle when it has atleast one busy cpu | 3323 | * A sched_group is semi-idle when it has atleast one busy cpu |
3161 | * and atleast one idle cpu. | 3324 | * and atleast one idle cpu. |
3162 | */ | 3325 | */ |
3163 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | 3326 | if (cpumask_empty(nohz.grp_idle_mask)) |
3164 | return 0; | 3327 | return 0; |
3165 | 3328 | ||
3166 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | 3329 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) |
3167 | return 0; | 3330 | return 0; |
3168 | 3331 | ||
3169 | return 1; | 3332 | return 1; |
@@ -3196,7 +3359,7 @@ static int find_new_ilb(int cpu) | |||
3196 | * Optimize for the case when we have no idle CPUs or only one | 3359 | * Optimize for the case when we have no idle CPUs or only one |
3197 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | 3360 | * idle CPU. Don't walk the sched_domain hierarchy in such cases |
3198 | */ | 3361 | */ |
3199 | if (cpumask_weight(nohz.cpu_mask) < 2) | 3362 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3200 | goto out_done; | 3363 | goto out_done; |
3201 | 3364 | ||
3202 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3365 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
@@ -3204,7 +3367,7 @@ static int find_new_ilb(int cpu) | |||
3204 | 3367 | ||
3205 | do { | 3368 | do { |
3206 | if (is_semi_idle_group(ilb_group)) | 3369 | if (is_semi_idle_group(ilb_group)) |
3207 | return cpumask_first(nohz.ilb_grp_nohz_mask); | 3370 | return cpumask_first(nohz.grp_idle_mask); |
3208 | 3371 | ||
3209 | ilb_group = ilb_group->next; | 3372 | ilb_group = ilb_group->next; |
3210 | 3373 | ||
@@ -3212,98 +3375,116 @@ static int find_new_ilb(int cpu) | |||
3212 | } | 3375 | } |
3213 | 3376 | ||
3214 | out_done: | 3377 | out_done: |
3215 | return cpumask_first(nohz.cpu_mask); | 3378 | return nr_cpu_ids; |
3216 | } | 3379 | } |
3217 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3380 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3218 | static inline int find_new_ilb(int call_cpu) | 3381 | static inline int find_new_ilb(int call_cpu) |
3219 | { | 3382 | { |
3220 | return cpumask_first(nohz.cpu_mask); | 3383 | return nr_cpu_ids; |
3221 | } | 3384 | } |
3222 | #endif | 3385 | #endif |
3223 | 3386 | ||
3224 | /* | 3387 | /* |
3388 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
3389 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
3390 | * CPU (if there is one). | ||
3391 | */ | ||
3392 | static void nohz_balancer_kick(int cpu) | ||
3393 | { | ||
3394 | int ilb_cpu; | ||
3395 | |||
3396 | nohz.next_balance++; | ||
3397 | |||
3398 | ilb_cpu = get_nohz_load_balancer(); | ||
3399 | |||
3400 | if (ilb_cpu >= nr_cpu_ids) { | ||
3401 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
3402 | if (ilb_cpu >= nr_cpu_ids) | ||
3403 | return; | ||
3404 | } | ||
3405 | |||
3406 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | ||
3407 | struct call_single_data *cp; | ||
3408 | |||
3409 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | ||
3410 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | ||
3411 | __smp_call_function_single(ilb_cpu, cp, 0); | ||
3412 | } | ||
3413 | return; | ||
3414 | } | ||
3415 | |||
3416 | /* | ||
3225 | * This routine will try to nominate the ilb (idle load balancing) | 3417 | * This routine will try to nominate the ilb (idle load balancing) |
3226 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 3418 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
3227 | * load balancing on behalf of all those cpus. If all the cpus in the system | 3419 | * load balancing on behalf of all those cpus. |
3228 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
3229 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
3230 | * arrives... | ||
3231 | * | ||
3232 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
3233 | * for idle load balancing. ilb owner will still be part of | ||
3234 | * nohz.cpu_mask.. | ||
3235 | * | 3420 | * |
3236 | * While stopping the tick, this cpu will become the ilb owner if there | 3421 | * When the ilb owner becomes busy, we will not have new ilb owner until some |
3237 | * is no other owner. And will be the owner till that cpu becomes busy | 3422 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick |
3238 | * or if all cpus in the system stop their ticks at which point | 3423 | * idle load balancing by kicking one of the idle CPUs. |
3239 | * there is no need for ilb owner. | ||
3240 | * | 3424 | * |
3241 | * When the ilb owner becomes busy, it nominates another owner, during the | 3425 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this |
3242 | * next busy scheduler_tick() | 3426 | * ilb owner CPU in future (when there is a need for idle load balancing on |
3427 | * behalf of all idle CPUs). | ||
3243 | */ | 3428 | */ |
3244 | int select_nohz_load_balancer(int stop_tick) | 3429 | void select_nohz_load_balancer(int stop_tick) |
3245 | { | 3430 | { |
3246 | int cpu = smp_processor_id(); | 3431 | int cpu = smp_processor_id(); |
3247 | 3432 | ||
3248 | if (stop_tick) { | 3433 | if (stop_tick) { |
3249 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
3250 | |||
3251 | if (!cpu_active(cpu)) { | 3434 | if (!cpu_active(cpu)) { |
3252 | if (atomic_read(&nohz.load_balancer) != cpu) | 3435 | if (atomic_read(&nohz.load_balancer) != cpu) |
3253 | return 0; | 3436 | return; |
3254 | 3437 | ||
3255 | /* | 3438 | /* |
3256 | * If we are going offline and still the leader, | 3439 | * If we are going offline and still the leader, |
3257 | * give up! | 3440 | * give up! |
3258 | */ | 3441 | */ |
3259 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3442 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3443 | nr_cpu_ids) != cpu) | ||
3260 | BUG(); | 3444 | BUG(); |
3261 | 3445 | ||
3262 | return 0; | 3446 | return; |
3263 | } | 3447 | } |
3264 | 3448 | ||
3265 | cpumask_set_cpu(cpu, nohz.cpu_mask); | 3449 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
3266 | 3450 | ||
3267 | /* time for ilb owner also to sleep */ | 3451 | if (atomic_read(&nohz.first_pick_cpu) == cpu) |
3268 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | 3452 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); |
3269 | if (atomic_read(&nohz.load_balancer) == cpu) | 3453 | if (atomic_read(&nohz.second_pick_cpu) == cpu) |
3270 | atomic_set(&nohz.load_balancer, -1); | 3454 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); |
3271 | return 0; | ||
3272 | } | ||
3273 | 3455 | ||
3274 | if (atomic_read(&nohz.load_balancer) == -1) { | 3456 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { |
3275 | /* make me the ilb owner */ | ||
3276 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
3277 | return 1; | ||
3278 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3279 | int new_ilb; | 3457 | int new_ilb; |
3280 | 3458 | ||
3281 | if (!(sched_smt_power_savings || | 3459 | /* make me the ilb owner */ |
3282 | sched_mc_power_savings)) | 3460 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, |
3283 | return 1; | 3461 | cpu) != nr_cpu_ids) |
3462 | return; | ||
3463 | |||
3284 | /* | 3464 | /* |
3285 | * Check to see if there is a more power-efficient | 3465 | * Check to see if there is a more power-efficient |
3286 | * ilb. | 3466 | * ilb. |
3287 | */ | 3467 | */ |
3288 | new_ilb = find_new_ilb(cpu); | 3468 | new_ilb = find_new_ilb(cpu); |
3289 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | 3469 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { |
3290 | atomic_set(&nohz.load_balancer, -1); | 3470 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
3291 | resched_cpu(new_ilb); | 3471 | resched_cpu(new_ilb); |
3292 | return 0; | 3472 | return; |
3293 | } | 3473 | } |
3294 | return 1; | 3474 | return; |
3295 | } | 3475 | } |
3296 | } else { | 3476 | } else { |
3297 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 3477 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) |
3298 | return 0; | 3478 | return; |
3299 | 3479 | ||
3300 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | 3480 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
3301 | 3481 | ||
3302 | if (atomic_read(&nohz.load_balancer) == cpu) | 3482 | if (atomic_read(&nohz.load_balancer) == cpu) |
3303 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3483 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
3484 | nr_cpu_ids) != cpu) | ||
3304 | BUG(); | 3485 | BUG(); |
3305 | } | 3486 | } |
3306 | return 0; | 3487 | return; |
3307 | } | 3488 | } |
3308 | #endif | 3489 | #endif |
3309 | 3490 | ||
@@ -3385,11 +3566,102 @@ out: | |||
3385 | rq->next_balance = next_balance; | 3566 | rq->next_balance = next_balance; |
3386 | } | 3567 | } |
3387 | 3568 | ||
3569 | #ifdef CONFIG_NO_HZ | ||
3388 | /* | 3570 | /* |
3389 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3571 | * In CONFIG_NO_HZ case, the idle balance kickee will do the |
3390 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
3391 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3572 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
3392 | */ | 3573 | */ |
3574 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | ||
3575 | { | ||
3576 | struct rq *this_rq = cpu_rq(this_cpu); | ||
3577 | struct rq *rq; | ||
3578 | int balance_cpu; | ||
3579 | |||
3580 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | ||
3581 | return; | ||
3582 | |||
3583 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
3584 | if (balance_cpu == this_cpu) | ||
3585 | continue; | ||
3586 | |||
3587 | /* | ||
3588 | * If this cpu gets work to do, stop the load balancing | ||
3589 | * work being done for other cpus. Next load | ||
3590 | * balancing owner will pick it up. | ||
3591 | */ | ||
3592 | if (need_resched()) { | ||
3593 | this_rq->nohz_balance_kick = 0; | ||
3594 | break; | ||
3595 | } | ||
3596 | |||
3597 | raw_spin_lock_irq(&this_rq->lock); | ||
3598 | update_rq_clock(this_rq); | ||
3599 | update_cpu_load(this_rq); | ||
3600 | raw_spin_unlock_irq(&this_rq->lock); | ||
3601 | |||
3602 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3603 | |||
3604 | rq = cpu_rq(balance_cpu); | ||
3605 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3606 | this_rq->next_balance = rq->next_balance; | ||
3607 | } | ||
3608 | nohz.next_balance = this_rq->next_balance; | ||
3609 | this_rq->nohz_balance_kick = 0; | ||
3610 | } | ||
3611 | |||
3612 | /* | ||
3613 | * Current heuristic for kicking the idle load balancer | ||
3614 | * - first_pick_cpu is the one of the busy CPUs. It will kick | ||
3615 | * idle load balancer when it has more than one process active. This | ||
3616 | * eliminates the need for idle load balancing altogether when we have | ||
3617 | * only one running process in the system (common case). | ||
3618 | * - If there are more than one busy CPU, idle load balancer may have | ||
3619 | * to run for active_load_balance to happen (i.e., two busy CPUs are | ||
3620 | * SMT or core siblings and can run better if they move to different | ||
3621 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
3622 | * which will kick idle load balancer as soon as it has any load. | ||
3623 | */ | ||
3624 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | ||
3625 | { | ||
3626 | unsigned long now = jiffies; | ||
3627 | int ret; | ||
3628 | int first_pick_cpu, second_pick_cpu; | ||
3629 | |||
3630 | if (time_before(now, nohz.next_balance)) | ||
3631 | return 0; | ||
3632 | |||
3633 | if (rq->idle_at_tick) | ||
3634 | return 0; | ||
3635 | |||
3636 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | ||
3637 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | ||
3638 | |||
3639 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | ||
3640 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
3641 | return 0; | ||
3642 | |||
3643 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | ||
3644 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3645 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
3646 | if (rq->nr_running > 1) | ||
3647 | return 1; | ||
3648 | } else { | ||
3649 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | ||
3650 | if (ret == nr_cpu_ids || ret == cpu) { | ||
3651 | if (rq->nr_running) | ||
3652 | return 1; | ||
3653 | } | ||
3654 | } | ||
3655 | return 0; | ||
3656 | } | ||
3657 | #else | ||
3658 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | ||
3659 | #endif | ||
3660 | |||
3661 | /* | ||
3662 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
3663 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | ||
3664 | */ | ||
3393 | static void run_rebalance_domains(struct softirq_action *h) | 3665 | static void run_rebalance_domains(struct softirq_action *h) |
3394 | { | 3666 | { |
3395 | int this_cpu = smp_processor_id(); | 3667 | int this_cpu = smp_processor_id(); |
@@ -3399,37 +3671,12 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
3399 | 3671 | ||
3400 | rebalance_domains(this_cpu, idle); | 3672 | rebalance_domains(this_cpu, idle); |
3401 | 3673 | ||
3402 | #ifdef CONFIG_NO_HZ | ||
3403 | /* | 3674 | /* |
3404 | * If this cpu is the owner for idle load balancing, then do the | 3675 | * If this cpu has a pending nohz_balance_kick, then do the |
3405 | * balancing on behalf of the other idle cpus whose ticks are | 3676 | * balancing on behalf of the other idle cpus whose ticks are |
3406 | * stopped. | 3677 | * stopped. |
3407 | */ | 3678 | */ |
3408 | if (this_rq->idle_at_tick && | 3679 | nohz_idle_balance(this_cpu, idle); |
3409 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
3410 | struct rq *rq; | ||
3411 | int balance_cpu; | ||
3412 | |||
3413 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
3414 | if (balance_cpu == this_cpu) | ||
3415 | continue; | ||
3416 | |||
3417 | /* | ||
3418 | * If this cpu gets work to do, stop the load balancing | ||
3419 | * work being done for other cpus. Next load | ||
3420 | * balancing owner will pick it up. | ||
3421 | */ | ||
3422 | if (need_resched()) | ||
3423 | break; | ||
3424 | |||
3425 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
3426 | |||
3427 | rq = cpu_rq(balance_cpu); | ||
3428 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
3429 | this_rq->next_balance = rq->next_balance; | ||
3430 | } | ||
3431 | } | ||
3432 | #endif | ||
3433 | } | 3680 | } |
3434 | 3681 | ||
3435 | static inline int on_null_domain(int cpu) | 3682 | static inline int on_null_domain(int cpu) |
@@ -3439,57 +3686,17 @@ static inline int on_null_domain(int cpu) | |||
3439 | 3686 | ||
3440 | /* | 3687 | /* |
3441 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3688 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
3442 | * | ||
3443 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
3444 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
3445 | * if the whole system is idle. | ||
3446 | */ | 3689 | */ |
3447 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3690 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3448 | { | 3691 | { |
3449 | #ifdef CONFIG_NO_HZ | ||
3450 | /* | ||
3451 | * If we were in the nohz mode recently and busy at the current | ||
3452 | * scheduler tick, then check if we need to nominate new idle | ||
3453 | * load balancer. | ||
3454 | */ | ||
3455 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
3456 | rq->in_nohz_recently = 0; | ||
3457 | |||
3458 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
3459 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
3460 | atomic_set(&nohz.load_balancer, -1); | ||
3461 | } | ||
3462 | |||
3463 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
3464 | int ilb = find_new_ilb(cpu); | ||
3465 | |||
3466 | if (ilb < nr_cpu_ids) | ||
3467 | resched_cpu(ilb); | ||
3468 | } | ||
3469 | } | ||
3470 | |||
3471 | /* | ||
3472 | * If this cpu is idle and doing idle load balancing for all the | ||
3473 | * cpus with ticks stopped, is it time for that to stop? | ||
3474 | */ | ||
3475 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
3476 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
3477 | resched_cpu(cpu); | ||
3478 | return; | ||
3479 | } | ||
3480 | |||
3481 | /* | ||
3482 | * If this cpu is idle and the idle load balancing is done by | ||
3483 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
3484 | */ | ||
3485 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
3486 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
3487 | return; | ||
3488 | #endif | ||
3489 | /* Don't need to rebalance while attached to NULL domain */ | 3692 | /* Don't need to rebalance while attached to NULL domain */ |
3490 | if (time_after_eq(jiffies, rq->next_balance) && | 3693 | if (time_after_eq(jiffies, rq->next_balance) && |
3491 | likely(!on_null_domain(cpu))) | 3694 | likely(!on_null_domain(cpu))) |
3492 | raise_softirq(SCHED_SOFTIRQ); | 3695 | raise_softirq(SCHED_SOFTIRQ); |
3696 | #ifdef CONFIG_NO_HZ | ||
3697 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | ||
3698 | nohz_balancer_kick(cpu); | ||
3699 | #endif | ||
3493 | } | 3700 | } |
3494 | 3701 | ||
3495 | static void rq_online_fair(struct rq *rq) | 3702 | static void rq_online_fair(struct rq *rq) |
@@ -3542,6 +3749,8 @@ static void task_fork_fair(struct task_struct *p) | |||
3542 | 3749 | ||
3543 | raw_spin_lock_irqsave(&rq->lock, flags); | 3750 | raw_spin_lock_irqsave(&rq->lock, flags); |
3544 | 3751 | ||
3752 | update_rq_clock(rq); | ||
3753 | |||
3545 | if (unlikely(task_cpu(p) != this_cpu)) | 3754 | if (unlikely(task_cpu(p) != this_cpu)) |
3546 | __set_task_cpu(p, this_cpu); | 3755 | __set_task_cpu(p, this_cpu); |
3547 | 3756 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c6..d10c80ebb67a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
1663 | { | 1663 | { |
1664 | unsigned long soft, hard; | 1664 | unsigned long soft, hard; |
1665 | 1665 | ||
1666 | if (!p->signal) | ||
1667 | return; | ||
1668 | |||
1669 | /* max may change after cur was read, this will be fixed next tick */ | 1666 | /* max may change after cur was read, this will be fixed next tick */ |
1670 | soft = task_rlimit(p, RLIMIT_RTTIME); | 1667 | soft = task_rlimit(p, RLIMIT_RTTIME); |
1671 | hard = task_rlimit_max(p, RLIMIT_RTTIME); | 1668 | hard = task_rlimit_max(p, RLIMIT_RTTIME); |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b0..25c2f962f6fc 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
295 | static inline void account_group_user_time(struct task_struct *tsk, | 295 | static inline void account_group_user_time(struct task_struct *tsk, |
296 | cputime_t cputime) | 296 | cputime_t cputime) |
297 | { | 297 | { |
298 | struct thread_group_cputimer *cputimer; | 298 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
299 | |||
300 | /* tsk == current, ensure it is safe to use ->signal */ | ||
301 | if (unlikely(tsk->exit_state)) | ||
302 | return; | ||
303 | |||
304 | cputimer = &tsk->signal->cputimer; | ||
305 | 299 | ||
306 | if (!cputimer->running) | 300 | if (!cputimer->running) |
307 | return; | 301 | return; |
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
325 | static inline void account_group_system_time(struct task_struct *tsk, | 319 | static inline void account_group_system_time(struct task_struct *tsk, |
326 | cputime_t cputime) | 320 | cputime_t cputime) |
327 | { | 321 | { |
328 | struct thread_group_cputimer *cputimer; | 322 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
329 | |||
330 | /* tsk == current, ensure it is safe to use ->signal */ | ||
331 | if (unlikely(tsk->exit_state)) | ||
332 | return; | ||
333 | |||
334 | cputimer = &tsk->signal->cputimer; | ||
335 | 323 | ||
336 | if (!cputimer->running) | 324 | if (!cputimer->running) |
337 | return; | 325 | return; |
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
355 | static inline void account_group_exec_runtime(struct task_struct *tsk, | 343 | static inline void account_group_exec_runtime(struct task_struct *tsk, |
356 | unsigned long long ns) | 344 | unsigned long long ns) |
357 | { | 345 | { |
358 | struct thread_group_cputimer *cputimer; | 346 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
359 | struct signal_struct *sig; | ||
360 | |||
361 | sig = tsk->signal; | ||
362 | /* see __exit_signal()->task_rq_unlock_wait() */ | ||
363 | barrier(); | ||
364 | if (unlikely(!sig)) | ||
365 | return; | ||
366 | |||
367 | cputimer = &sig->cputimer; | ||
368 | 347 | ||
369 | if (!cputimer->running) | 348 | if (!cputimer->running) |
370 | return; | 349 | return; |
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c43645298..000000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null | |||
@@ -1,227 +0,0 @@ | |||
1 | /* Slow work debugging | ||
2 | * | ||
3 | * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/module.h> | ||
13 | #include <linux/slow-work.h> | ||
14 | #include <linux/fs.h> | ||
15 | #include <linux/time.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include "slow-work.h" | ||
18 | |||
19 | #define ITERATOR_SHIFT (BITS_PER_LONG - 4) | ||
20 | #define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) | ||
21 | #define ITERATOR_COUNTER (~ITERATOR_SELECTOR) | ||
22 | |||
23 | void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) | ||
24 | { | ||
25 | seq_puts(m, "Slow-work: New thread"); | ||
26 | } | ||
27 | |||
28 | /* | ||
29 | * Render the time mark field on a work item into a 5-char time with units plus | ||
30 | * a space | ||
31 | */ | ||
32 | static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) | ||
33 | { | ||
34 | struct timespec now, diff; | ||
35 | |||
36 | now = CURRENT_TIME; | ||
37 | diff = timespec_sub(now, work->mark); | ||
38 | |||
39 | if (diff.tv_sec < 0) | ||
40 | seq_puts(m, " -ve "); | ||
41 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) | ||
42 | seq_printf(m, "%3luns ", diff.tv_nsec); | ||
43 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) | ||
44 | seq_printf(m, "%3luus ", diff.tv_nsec / 1000); | ||
45 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) | ||
46 | seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); | ||
47 | else if (diff.tv_sec <= 1) | ||
48 | seq_puts(m, " 1s "); | ||
49 | else if (diff.tv_sec < 60) | ||
50 | seq_printf(m, "%4lus ", diff.tv_sec); | ||
51 | else if (diff.tv_sec < 60 * 60) | ||
52 | seq_printf(m, "%4lum ", diff.tv_sec / 60); | ||
53 | else if (diff.tv_sec < 60 * 60 * 24) | ||
54 | seq_printf(m, "%4luh ", diff.tv_sec / 3600); | ||
55 | else | ||
56 | seq_puts(m, "exces "); | ||
57 | } | ||
58 | |||
59 | /* | ||
60 | * Describe a slow work item for debugfs | ||
61 | */ | ||
62 | static int slow_work_runqueue_show(struct seq_file *m, void *v) | ||
63 | { | ||
64 | struct slow_work *work; | ||
65 | struct list_head *p = v; | ||
66 | unsigned long id; | ||
67 | |||
68 | switch ((unsigned long) v) { | ||
69 | case 1: | ||
70 | seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); | ||
71 | return 0; | ||
72 | case 2: | ||
73 | seq_puts(m, "=== ===== ================ == ===== ==========\n"); | ||
74 | return 0; | ||
75 | |||
76 | case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: | ||
77 | id = (unsigned long) v - 3; | ||
78 | |||
79 | read_lock(&slow_work_execs_lock); | ||
80 | work = slow_work_execs[id]; | ||
81 | if (work) { | ||
82 | smp_read_barrier_depends(); | ||
83 | |||
84 | seq_printf(m, "%3lu %5d %16p %2lx ", | ||
85 | id, slow_work_pids[id], work, work->flags); | ||
86 | slow_work_print_mark(m, work); | ||
87 | |||
88 | if (work->ops->desc) | ||
89 | work->ops->desc(work, m); | ||
90 | seq_putc(m, '\n'); | ||
91 | } | ||
92 | read_unlock(&slow_work_execs_lock); | ||
93 | return 0; | ||
94 | |||
95 | default: | ||
96 | work = list_entry(p, struct slow_work, link); | ||
97 | seq_printf(m, "%3s - %16p %2lx ", | ||
98 | work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", | ||
99 | work, work->flags); | ||
100 | slow_work_print_mark(m, work); | ||
101 | |||
102 | if (work->ops->desc) | ||
103 | work->ops->desc(work, m); | ||
104 | seq_putc(m, '\n'); | ||
105 | return 0; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * map the iterator to a work item | ||
111 | */ | ||
112 | static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) | ||
113 | { | ||
114 | struct list_head *p; | ||
115 | unsigned long count, id; | ||
116 | |||
117 | switch (*_pos >> ITERATOR_SHIFT) { | ||
118 | case 0x0: | ||
119 | if (*_pos == 0) | ||
120 | *_pos = 1; | ||
121 | if (*_pos < 3) | ||
122 | return (void *)(unsigned long) *_pos; | ||
123 | if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) | ||
124 | for (id = *_pos - 3; | ||
125 | id < SLOW_WORK_THREAD_LIMIT; | ||
126 | id++, (*_pos)++) | ||
127 | if (slow_work_execs[id]) | ||
128 | return (void *)(unsigned long) *_pos; | ||
129 | *_pos = 0x1UL << ITERATOR_SHIFT; | ||
130 | |||
131 | case 0x1: | ||
132 | count = *_pos & ITERATOR_COUNTER; | ||
133 | list_for_each(p, &slow_work_queue) { | ||
134 | if (count == 0) | ||
135 | return p; | ||
136 | count--; | ||
137 | } | ||
138 | *_pos = 0x2UL << ITERATOR_SHIFT; | ||
139 | |||
140 | case 0x2: | ||
141 | count = *_pos & ITERATOR_COUNTER; | ||
142 | list_for_each(p, &vslow_work_queue) { | ||
143 | if (count == 0) | ||
144 | return p; | ||
145 | count--; | ||
146 | } | ||
147 | *_pos = 0x3UL << ITERATOR_SHIFT; | ||
148 | |||
149 | default: | ||
150 | return NULL; | ||
151 | } | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * set up the iterator to start reading from the first line | ||
156 | */ | ||
157 | static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) | ||
158 | { | ||
159 | spin_lock_irq(&slow_work_queue_lock); | ||
160 | return slow_work_runqueue_index(m, _pos); | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * move to the next line | ||
165 | */ | ||
166 | static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) | ||
167 | { | ||
168 | struct list_head *p = v; | ||
169 | unsigned long selector = *_pos >> ITERATOR_SHIFT; | ||
170 | |||
171 | (*_pos)++; | ||
172 | switch (selector) { | ||
173 | case 0x0: | ||
174 | return slow_work_runqueue_index(m, _pos); | ||
175 | |||
176 | case 0x1: | ||
177 | if (*_pos >> ITERATOR_SHIFT == 0x1) { | ||
178 | p = p->next; | ||
179 | if (p != &slow_work_queue) | ||
180 | return p; | ||
181 | } | ||
182 | *_pos = 0x2UL << ITERATOR_SHIFT; | ||
183 | p = &vslow_work_queue; | ||
184 | |||
185 | case 0x2: | ||
186 | if (*_pos >> ITERATOR_SHIFT == 0x2) { | ||
187 | p = p->next; | ||
188 | if (p != &vslow_work_queue) | ||
189 | return p; | ||
190 | } | ||
191 | *_pos = 0x3UL << ITERATOR_SHIFT; | ||
192 | |||
193 | default: | ||
194 | return NULL; | ||
195 | } | ||
196 | } | ||
197 | |||
198 | /* | ||
199 | * clean up after reading | ||
200 | */ | ||
201 | static void slow_work_runqueue_stop(struct seq_file *m, void *v) | ||
202 | { | ||
203 | spin_unlock_irq(&slow_work_queue_lock); | ||
204 | } | ||
205 | |||
206 | static const struct seq_operations slow_work_runqueue_ops = { | ||
207 | .start = slow_work_runqueue_start, | ||
208 | .stop = slow_work_runqueue_stop, | ||
209 | .next = slow_work_runqueue_next, | ||
210 | .show = slow_work_runqueue_show, | ||
211 | }; | ||
212 | |||
213 | /* | ||
214 | * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents | ||
215 | */ | ||
216 | static int slow_work_runqueue_open(struct inode *inode, struct file *file) | ||
217 | { | ||
218 | return seq_open(file, &slow_work_runqueue_ops); | ||
219 | } | ||
220 | |||
221 | const struct file_operations slow_work_runqueue_fops = { | ||
222 | .owner = THIS_MODULE, | ||
223 | .open = slow_work_runqueue_open, | ||
224 | .read = seq_read, | ||
225 | .llseek = seq_lseek, | ||
226 | .release = seq_release, | ||
227 | }; | ||
diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4f..000000000000 --- a/kernel/slow-work.c +++ /dev/null | |||
@@ -1,1068 +0,0 @@ | |||
1 | /* Worker thread pool for slow items, such as filesystem lookups or mkdirs | ||
2 | * | ||
3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | * | ||
11 | * See Documentation/slow-work.txt | ||
12 | */ | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/slow-work.h> | ||
16 | #include <linux/kthread.h> | ||
17 | #include <linux/freezer.h> | ||
18 | #include <linux/wait.h> | ||
19 | #include <linux/debugfs.h> | ||
20 | #include "slow-work.h" | ||
21 | |||
/* timer callbacks, defined further down */
static void slow_work_cull_timeout(unsigned long data);
static void slow_work_oom_timeout(unsigned long data);

#ifdef CONFIG_SYSCTL
/* sysctl handlers for the min-threads and max-threads knobs */
static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos);

static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos);
#endif
32 | |||
/*
 * Pool sizing.  While the facility is in use the pool holds at least
 * slow_work_min_threads threads and may grow to slow_work_max_threads.
 *
 * Only a share of the pool may be busy with very slow operations at once;
 * vslow_work_proportion gives that share as a percentage of the threads.
 */
static unsigned slow_work_min_threads = 2;
static unsigned slow_work_max_threads = 4;
static unsigned vslow_work_proportion = 50;
43 | |||
#ifdef CONFIG_SYSCTL
/* fixed bounds handed to the sysctl handlers through extra1/extra2 */
static const int slow_work_min_min_threads = 2;
static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
static const int slow_work_min_vslow = 1;
static const int slow_work_max_vslow = 99;

/*
 * Tunables for the thread pool.  Note that min-threads and max-threads bound
 * each other: the upper bound of the former and the lower bound of the latter
 * point at the live value of the other knob.
 */
ctl_table slow_work_sysctls[] = {
	{
		/* minimum pool size: [slow_work_min_min_threads, max-threads] */
		.procname	= "min-threads",
		.mode		= 0644,
		.data		= &slow_work_min_threads,
		.maxlen		= sizeof(unsigned),
		.extra1		= (void *) &slow_work_min_min_threads,
		.extra2		= &slow_work_max_threads,
		.proc_handler	= slow_work_min_threads_sysctl,
	},
	{
		/* maximum pool size: [min-threads, slow_work_max_max_threads] */
		.procname	= "max-threads",
		.mode		= 0644,
		.data		= &slow_work_max_threads,
		.maxlen		= sizeof(unsigned),
		.extra1		= &slow_work_min_threads,
		.extra2		= (void *) &slow_work_max_max_threads,
		.proc_handler	= slow_work_max_threads_sysctl,
	},
	{
		/* share of threads allowed to run very slow work, in percent */
		.procname	= "vslow-percentage",
		.mode		= 0644,
		.data		= &vslow_work_proportion,
		.maxlen		= sizeof(unsigned),
		.extra1		= (void *) &slow_work_min_vslow,
		.extra2		= (void *) &slow_work_max_vslow,
		.proc_handler	= proc_dointvec_minmax,
	},
	{}	/* terminator */
};
#endif
81 | |||
82 | /* | ||
83 | * The active state of the thread pool | ||
84 | */ | ||
85 | static atomic_t slow_work_thread_count; | ||
86 | static atomic_t vslow_work_executing_count; | ||
87 | |||
88 | static bool slow_work_may_not_start_new_thread; | ||
89 | static bool slow_work_cull; /* cull a thread due to lack of activity */ | ||
90 | static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); | ||
91 | static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); | ||
92 | static struct slow_work slow_work_new_thread; /* new thread starter */ | ||
93 | |||
94 | /* | ||
95 | * slow work ID allocation (use slow_work_queue_lock) | ||
96 | */ | ||
97 | static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); | ||
98 | |||
/*
 * Bookkeeping that lets module unregistration wait for work items, so that a
 * module's put_ref() cannot vanish from under a thread during module unload.
 *
 * Each pool thread records, in its slot of slow_work_thread_processing[],
 * the owner module of the item it is handling.  Without CONFIG_MODULES the
 * helpers compile to nothing.
 */
#ifdef CONFIG_MODULES
static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
static struct module *slow_work_unreg_module;
static struct slow_work *slow_work_unreg_work_item;
static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
static DEFINE_MUTEX(slow_work_unreg_sync_lock);

/* Note the owner of the item thread @id picked up; a NULL @work is ignored. */
static void slow_work_set_thread_processing(int id, struct slow_work *work)
{
	if (!work)
		return;
	slow_work_thread_processing[id] = work->owner;
}

/*
 * Thread @id has finished with @work: clear its slot and, if an unregistration
 * is waiting on this very item or on the module recorded in the slot, wake
 * the waiters.
 */
static void slow_work_done_thread_processing(int id, struct slow_work *work)
{
	struct module *owner = slow_work_thread_processing[id];

	slow_work_thread_processing[id] = NULL;
	smp_mb();	/* clear the slot before looking at the unreg targets */
	if (slow_work_unreg_work_item != work &&
	    slow_work_unreg_module != owner)
		return;
	wake_up_all(&slow_work_unreg_wq);
}

/* Clear thread @id's slot without waking anyone. */
static void slow_work_clear_thread_processing(int id)
{
	slow_work_thread_processing[id] = NULL;
}
#else
static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
static void slow_work_clear_thread_processing(int id) {}
#endif
134 | |||
/*
 * Per-thread record of the item currently being executed and the thread's
 * PID, kept for the debugging interface.  Indexed by thread ID.
 */
#ifdef CONFIG_SLOW_WORK_DEBUG
struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
DEFINE_RWLOCK(slow_work_execs_lock);
#endif
143 | |||
/*
 * Work item queues and the lock that protects them.
 *
 * There is one queue for slow items and one for very slow items.  Both are
 * shared by all CPUs: per-CPU queues would make no sense because the number
 * of pool threads is unrelated to the number of CPUs.
 */
LIST_HEAD(slow_work_queue);
LIST_HEAD(vslow_work_queue);
DEFINE_SPINLOCK(slow_work_queue_lock);

/*
 * Wait queues that are pinged when an item is placed on an empty queue.
 * A work item that is occupying a thread by sleeping, and which could
 * instead be deferred, waits here so it can yield its thread and requeue
 * itself when other work turns up.
 */
static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
164 | |||
165 | /* | ||
166 | * The thread controls. A variable used to signal to the threads that they | ||
167 | * should exit when the queue is empty, a waitqueue used by the threads to wait | ||
168 | * for signals, and a completion set by the last thread to exit. | ||
169 | */ | ||
170 | static bool slow_work_threads_should_exit; | ||
171 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); | ||
172 | static DECLARE_COMPLETION(slow_work_last_thread_exited); | ||
173 | |||
174 | /* | ||
175 | * The number of users of the thread pool and its lock. Whilst this is zero we | ||
176 | * have no threads hanging around, and when this reaches zero, we wait for all | ||
177 | * active or queued work items to complete and kill all the threads we do have. | ||
178 | */ | ||
179 | static int slow_work_user_count; | ||
180 | static DEFINE_MUTEX(slow_work_user_lock); | ||
181 | |||
182 | static inline int slow_work_get_ref(struct slow_work *work) | ||
183 | { | ||
184 | if (work->ops->get_ref) | ||
185 | return work->ops->get_ref(work); | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static inline void slow_work_put_ref(struct slow_work *work) | ||
191 | { | ||
192 | if (work->ops->put_ref) | ||
193 | work->ops->put_ref(work); | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Calculate the maximum number of active threads in the pool that are | ||
198 | * permitted to process very slow work items. | ||
199 | * | ||
200 | * The answer is rounded up to at least 1, but may not equal or exceed the | ||
201 | * maximum number of the threads in the pool. This means we always have at | ||
202 | * least one thread that can process slow work items, and we always have at | ||
203 | * least one thread that won't get tied up doing so. | ||
204 | */ | ||
205 | static unsigned slow_work_calc_vsmax(void) | ||
206 | { | ||
207 | unsigned vsmax; | ||
208 | |||
209 | vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; | ||
210 | vsmax /= 100; | ||
211 | vsmax = max(vsmax, 1U); | ||
212 | return min(vsmax, slow_work_max_threads - 1); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Attempt to execute stuff queued on a slow thread. Return true if we managed | ||
217 | * it, false if there was nothing to do. | ||
218 | */ | ||
219 | static noinline bool slow_work_execute(int id) | ||
220 | { | ||
221 | struct slow_work *work = NULL; | ||
222 | unsigned vsmax; | ||
223 | bool very_slow; | ||
224 | |||
225 | vsmax = slow_work_calc_vsmax(); | ||
226 | |||
227 | /* see if we can schedule a new thread to be started if we're not | ||
228 | * keeping up with the work */ | ||
229 | if (!waitqueue_active(&slow_work_thread_wq) && | ||
230 | (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && | ||
231 | atomic_read(&slow_work_thread_count) < slow_work_max_threads && | ||
232 | !slow_work_may_not_start_new_thread) | ||
233 | slow_work_enqueue(&slow_work_new_thread); | ||
234 | |||
235 | /* find something to execute */ | ||
236 | spin_lock_irq(&slow_work_queue_lock); | ||
237 | if (!list_empty(&vslow_work_queue) && | ||
238 | atomic_read(&vslow_work_executing_count) < vsmax) { | ||
239 | work = list_entry(vslow_work_queue.next, | ||
240 | struct slow_work, link); | ||
241 | if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) | ||
242 | BUG(); | ||
243 | list_del_init(&work->link); | ||
244 | atomic_inc(&vslow_work_executing_count); | ||
245 | very_slow = true; | ||
246 | } else if (!list_empty(&slow_work_queue)) { | ||
247 | work = list_entry(slow_work_queue.next, | ||
248 | struct slow_work, link); | ||
249 | if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) | ||
250 | BUG(); | ||
251 | list_del_init(&work->link); | ||
252 | very_slow = false; | ||
253 | } else { | ||
254 | very_slow = false; /* avoid the compiler warning */ | ||
255 | } | ||
256 | |||
257 | slow_work_set_thread_processing(id, work); | ||
258 | if (work) { | ||
259 | slow_work_mark_time(work); | ||
260 | slow_work_begin_exec(id, work); | ||
261 | } | ||
262 | |||
263 | spin_unlock_irq(&slow_work_queue_lock); | ||
264 | |||
265 | if (!work) | ||
266 | return false; | ||
267 | |||
268 | if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) | ||
269 | BUG(); | ||
270 | |||
271 | /* don't execute if the work is in the process of being cancelled */ | ||
272 | if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
273 | work->ops->execute(work); | ||
274 | |||
275 | if (very_slow) | ||
276 | atomic_dec(&vslow_work_executing_count); | ||
277 | clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); | ||
278 | |||
279 | /* wake up anyone waiting for this work to be complete */ | ||
280 | wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); | ||
281 | |||
282 | slow_work_end_exec(id, work); | ||
283 | |||
284 | /* if someone tried to enqueue the item whilst we were executing it, | ||
285 | * then it'll be left unenqueued to avoid multiple threads trying to | ||
286 | * execute it simultaneously | ||
287 | * | ||
288 | * there is, however, a race between us testing the pending flag and | ||
289 | * getting the spinlock, and between the enqueuer setting the pending | ||
290 | * flag and getting the spinlock, so we use a deferral bit to tell us | ||
291 | * if the enqueuer got there first | ||
292 | */ | ||
293 | if (test_bit(SLOW_WORK_PENDING, &work->flags)) { | ||
294 | spin_lock_irq(&slow_work_queue_lock); | ||
295 | |||
296 | if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && | ||
297 | test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) | ||
298 | goto auto_requeue; | ||
299 | |||
300 | spin_unlock_irq(&slow_work_queue_lock); | ||
301 | } | ||
302 | |||
303 | /* sort out the race between module unloading and put_ref() */ | ||
304 | slow_work_put_ref(work); | ||
305 | slow_work_done_thread_processing(id, work); | ||
306 | |||
307 | return true; | ||
308 | |||
309 | auto_requeue: | ||
310 | /* we must complete the enqueue operation | ||
311 | * - we transfer our ref on the item back to the appropriate queue | ||
312 | * - don't wake another thread up as we're awake already | ||
313 | */ | ||
314 | slow_work_mark_time(work); | ||
315 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) | ||
316 | list_add_tail(&work->link, &vslow_work_queue); | ||
317 | else | ||
318 | list_add_tail(&work->link, &slow_work_queue); | ||
319 | spin_unlock_irq(&slow_work_queue_lock); | ||
320 | slow_work_clear_thread_processing(id); | ||
321 | return true; | ||
322 | } | ||
323 | |||
324 | /** | ||
325 | * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work | ||
326 | * work: The work item under execution that wants to sleep | ||
327 | * _timeout: Scheduler sleep timeout | ||
328 | * | ||
329 | * Allow a requeueable work item to sleep on a slow-work processor thread until | ||
330 | * that thread is needed to do some other work or the sleep is interrupted by | ||
331 | * some other event. | ||
332 | * | ||
333 | * The caller must set up a wake up event before calling this and must have set | ||
334 | * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own | ||
335 | * condition before calling this function as no test is made here. | ||
336 | * | ||
337 | * False is returned if there is nothing on the queue; true is returned if the | ||
338 | * work item should be requeued | ||
339 | */ | ||
340 | bool slow_work_sleep_till_thread_needed(struct slow_work *work, | ||
341 | signed long *_timeout) | ||
342 | { | ||
343 | wait_queue_head_t *wfo_wq; | ||
344 | struct list_head *queue; | ||
345 | |||
346 | DEFINE_WAIT(wait); | ||
347 | |||
348 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
349 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
350 | queue = &vslow_work_queue; | ||
351 | } else { | ||
352 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
353 | queue = &slow_work_queue; | ||
354 | } | ||
355 | |||
356 | if (!list_empty(queue)) | ||
357 | return true; | ||
358 | |||
359 | add_wait_queue_exclusive(wfo_wq, &wait); | ||
360 | if (list_empty(queue)) | ||
361 | *_timeout = schedule_timeout(*_timeout); | ||
362 | finish_wait(wfo_wq, &wait); | ||
363 | |||
364 | return !list_empty(queue); | ||
365 | } | ||
366 | EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); | ||
367 | |||
368 | /** | ||
369 | * slow_work_enqueue - Schedule a slow work item for processing | ||
370 | * @work: The work item to queue | ||
371 | * | ||
372 | * Schedule a slow work item for processing. If the item is already undergoing | ||
373 | * execution, this guarantees not to re-enter the execution routine until the | ||
374 | * first execution finishes. | ||
375 | * | ||
376 | * The item is pinned by this function as it retains a reference to it, managed | ||
377 | * through the item operations. The item is unpinned once it has been | ||
378 | * executed. | ||
379 | * | ||
380 | * An item may hog the thread that is running it for a relatively large amount | ||
381 | * of time, sufficient, for example, to perform several lookup, mkdir, create | ||
382 | * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. | ||
383 | * | ||
384 | * Conversely, if a number of items are awaiting processing, it may take some | ||
385 | * time before any given item is given attention. The number of threads in the | ||
386 | * pool may be increased to deal with demand, but only up to a limit. | ||
387 | * | ||
388 | * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in | ||
389 | * the very slow queue, from which only a portion of the threads will be | ||
390 | * allowed to pick items to execute. This ensures that very slow items won't | ||
391 | * overly block ones that are just ordinarily slow. | ||
392 | * | ||
393 | * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is | ||
394 | * attempted queued) | ||
395 | */ | ||
396 | int slow_work_enqueue(struct slow_work *work) | ||
397 | { | ||
398 | wait_queue_head_t *wfo_wq; | ||
399 | struct list_head *queue; | ||
400 | unsigned long flags; | ||
401 | int ret; | ||
402 | |||
403 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
404 | return -ECANCELED; | ||
405 | |||
406 | BUG_ON(slow_work_user_count <= 0); | ||
407 | BUG_ON(!work); | ||
408 | BUG_ON(!work->ops); | ||
409 | |||
410 | /* when honouring an enqueue request, we only promise that we will run | ||
411 | * the work function in the future; we do not promise to run it once | ||
412 | * per enqueue request | ||
413 | * | ||
414 | * we use the PENDING bit to merge together repeat requests without | ||
415 | * having to disable IRQs and take the spinlock, whilst still | ||
416 | * maintaining our promise | ||
417 | */ | ||
418 | if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { | ||
419 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
420 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
421 | queue = &vslow_work_queue; | ||
422 | } else { | ||
423 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
424 | queue = &slow_work_queue; | ||
425 | } | ||
426 | |||
427 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
428 | |||
429 | if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) | ||
430 | goto cancelled; | ||
431 | |||
432 | /* we promise that we will not attempt to execute the work | ||
433 | * function in more than one thread simultaneously | ||
434 | * | ||
435 | * this, however, leaves us with a problem if we're asked to | ||
436 | * enqueue the work whilst someone is executing the work | ||
437 | * function as simply queueing the work immediately means that | ||
438 | * another thread may try executing it whilst it is already | ||
439 | * under execution | ||
440 | * | ||
441 | * to deal with this, we set the ENQ_DEFERRED bit instead of | ||
442 | * enqueueing, and the thread currently executing the work | ||
443 | * function will enqueue the work item when the work function | ||
444 | * returns and it has cleared the EXECUTING bit | ||
445 | */ | ||
446 | if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { | ||
447 | set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); | ||
448 | } else { | ||
449 | ret = slow_work_get_ref(work); | ||
450 | if (ret < 0) | ||
451 | goto failed; | ||
452 | slow_work_mark_time(work); | ||
453 | list_add_tail(&work->link, queue); | ||
454 | wake_up(&slow_work_thread_wq); | ||
455 | |||
456 | /* if someone who could be requeued is sleeping on a | ||
457 | * thread, then ask them to yield their thread */ | ||
458 | if (work->link.prev == queue) | ||
459 | wake_up(wfo_wq); | ||
460 | } | ||
461 | |||
462 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
463 | } | ||
464 | return 0; | ||
465 | |||
466 | cancelled: | ||
467 | ret = -ECANCELED; | ||
468 | failed: | ||
469 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
470 | return ret; | ||
471 | } | ||
472 | EXPORT_SYMBOL(slow_work_enqueue); | ||
473 | |||
/*
 * Action routine handed to wait_on_bit(): just give up the CPU and report
 * success (0) once rescheduled.  @word is not used.
 */
static int slow_work_wait(void *word)
{
	schedule();
	return 0;
}
479 | |||
480 | /** | ||
481 | * slow_work_cancel - Cancel a slow work item | ||
482 | * @work: The work item to cancel | ||
483 | * | ||
484 | * This function will cancel a previously enqueued work item. If we cannot | ||
485 | * cancel the work item, it is guarenteed to have run when this function | ||
486 | * returns. | ||
487 | */ | ||
488 | void slow_work_cancel(struct slow_work *work) | ||
489 | { | ||
490 | bool wait = true, put = false; | ||
491 | |||
492 | set_bit(SLOW_WORK_CANCELLING, &work->flags); | ||
493 | smp_mb(); | ||
494 | |||
495 | /* if the work item is a delayed work item with an active timer, we | ||
496 | * need to wait for the timer to finish _before_ getting the spinlock, | ||
497 | * lest we deadlock against the timer routine | ||
498 | * | ||
499 | * the timer routine will leave DELAYED set if it notices the | ||
500 | * CANCELLING flag in time | ||
501 | */ | ||
502 | if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { | ||
503 | struct delayed_slow_work *dwork = | ||
504 | container_of(work, struct delayed_slow_work, work); | ||
505 | del_timer_sync(&dwork->timer); | ||
506 | } | ||
507 | |||
508 | spin_lock_irq(&slow_work_queue_lock); | ||
509 | |||
510 | if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { | ||
511 | /* the timer routine aborted or never happened, so we are left | ||
512 | * holding the timer's reference on the item and should just | ||
513 | * drop the pending flag and wait for any ongoing execution to | ||
514 | * finish */ | ||
515 | struct delayed_slow_work *dwork = | ||
516 | container_of(work, struct delayed_slow_work, work); | ||
517 | |||
518 | BUG_ON(timer_pending(&dwork->timer)); | ||
519 | BUG_ON(!list_empty(&work->link)); | ||
520 | |||
521 | clear_bit(SLOW_WORK_DELAYED, &work->flags); | ||
522 | put = true; | ||
523 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
524 | |||
525 | } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && | ||
526 | !list_empty(&work->link)) { | ||
527 | /* the link in the pending queue holds a reference on the item | ||
528 | * that we will need to release */ | ||
529 | list_del_init(&work->link); | ||
530 | wait = false; | ||
531 | put = true; | ||
532 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
533 | |||
534 | } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { | ||
535 | /* the executor is holding our only reference on the item, so | ||
536 | * we merely need to wait for it to finish executing */ | ||
537 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
538 | } | ||
539 | |||
540 | spin_unlock_irq(&slow_work_queue_lock); | ||
541 | |||
542 | /* the EXECUTING flag is set by the executor whilst the spinlock is set | ||
543 | * and before the item is dequeued - so assuming the above doesn't | ||
544 | * actually dequeue it, simply waiting for the EXECUTING flag to be | ||
545 | * released here should be sufficient */ | ||
546 | if (wait) | ||
547 | wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, | ||
548 | TASK_UNINTERRUPTIBLE); | ||
549 | |||
550 | clear_bit(SLOW_WORK_CANCELLING, &work->flags); | ||
551 | if (put) | ||
552 | slow_work_put_ref(work); | ||
553 | } | ||
554 | EXPORT_SYMBOL(slow_work_cancel); | ||
555 | |||
556 | /* | ||
557 | * Handle expiry of the delay timer, indicating that a delayed slow work item | ||
558 | * should now be queued if not cancelled | ||
559 | */ | ||
560 | static void delayed_slow_work_timer(unsigned long data) | ||
561 | { | ||
562 | wait_queue_head_t *wfo_wq; | ||
563 | struct list_head *queue; | ||
564 | struct slow_work *work = (struct slow_work *) data; | ||
565 | unsigned long flags; | ||
566 | bool queued = false, put = false, first = false; | ||
567 | |||
568 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
569 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
570 | queue = &vslow_work_queue; | ||
571 | } else { | ||
572 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
573 | queue = &slow_work_queue; | ||
574 | } | ||
575 | |||
576 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
577 | if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { | ||
578 | clear_bit(SLOW_WORK_DELAYED, &work->flags); | ||
579 | |||
580 | if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { | ||
581 | /* we discard the reference the timer was holding in | ||
582 | * favour of the one the executor holds */ | ||
583 | set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); | ||
584 | put = true; | ||
585 | } else { | ||
586 | slow_work_mark_time(work); | ||
587 | list_add_tail(&work->link, queue); | ||
588 | queued = true; | ||
589 | if (work->link.prev == queue) | ||
590 | first = true; | ||
591 | } | ||
592 | } | ||
593 | |||
594 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
595 | if (put) | ||
596 | slow_work_put_ref(work); | ||
597 | if (first) | ||
598 | wake_up(wfo_wq); | ||
599 | if (queued) | ||
600 | wake_up(&slow_work_thread_wq); | ||
601 | } | ||
602 | |||
603 | /** | ||
604 | * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing | ||
605 | * @dwork: The delayed work item to queue | ||
606 | * @delay: When to start executing the work, in jiffies from now | ||
607 | * | ||
608 | * This is similar to slow_work_enqueue(), but it adds a delay before the work | ||
609 | * is actually queued for processing. | ||
610 | * | ||
611 | * The item can have delayed processing requested on it whilst it is being | ||
612 | * executed. The delay will begin immediately, and if it expires before the | ||
613 | * item finishes executing, the item will be placed back on the queue when it | ||
614 | * has done executing. | ||
615 | */ | ||
616 | int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, | ||
617 | unsigned long delay) | ||
618 | { | ||
619 | struct slow_work *work = &dwork->work; | ||
620 | unsigned long flags; | ||
621 | int ret; | ||
622 | |||
623 | if (delay == 0) | ||
624 | return slow_work_enqueue(&dwork->work); | ||
625 | |||
626 | BUG_ON(slow_work_user_count <= 0); | ||
627 | BUG_ON(!work); | ||
628 | BUG_ON(!work->ops); | ||
629 | |||
630 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
631 | return -ECANCELED; | ||
632 | |||
633 | if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { | ||
634 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
635 | |||
636 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
637 | goto cancelled; | ||
638 | |||
639 | /* the timer holds a reference whilst it is pending */ | ||
640 | ret = slow_work_get_ref(work); | ||
641 | if (ret < 0) | ||
642 | goto cant_get_ref; | ||
643 | |||
644 | if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) | ||
645 | BUG(); | ||
646 | dwork->timer.expires = jiffies + delay; | ||
647 | dwork->timer.data = (unsigned long) work; | ||
648 | dwork->timer.function = delayed_slow_work_timer; | ||
649 | add_timer(&dwork->timer); | ||
650 | |||
651 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
652 | } | ||
653 | |||
654 | return 0; | ||
655 | |||
656 | cancelled: | ||
657 | ret = -ECANCELED; | ||
658 | cant_get_ref: | ||
659 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
660 | return ret; | ||
661 | } | ||
662 | EXPORT_SYMBOL(delayed_slow_work_enqueue); | ||
663 | |||
664 | /* | ||
665 | * Schedule a cull of the thread pool at some time in the near future | ||
666 | */ | ||
667 | static void slow_work_schedule_cull(void) | ||
668 | { | ||
669 | mod_timer(&slow_work_cull_timer, | ||
670 | round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); | ||
671 | } | ||
672 | |||
673 | /* | ||
674 | * Worker thread culling algorithm | ||
675 | */ | ||
676 | static bool slow_work_cull_thread(void) | ||
677 | { | ||
678 | unsigned long flags; | ||
679 | bool do_cull = false; | ||
680 | |||
681 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
682 | |||
683 | if (slow_work_cull) { | ||
684 | slow_work_cull = false; | ||
685 | |||
686 | if (list_empty(&slow_work_queue) && | ||
687 | list_empty(&vslow_work_queue) && | ||
688 | atomic_read(&slow_work_thread_count) > | ||
689 | slow_work_min_threads) { | ||
690 | slow_work_schedule_cull(); | ||
691 | do_cull = true; | ||
692 | } | ||
693 | } | ||
694 | |||
695 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
696 | return do_cull; | ||
697 | } | ||
698 | |||
699 | /* | ||
700 | * Determine if there is slow work available for dispatch | ||
701 | */ | ||
702 | static inline bool slow_work_available(int vsmax) | ||
703 | { | ||
704 | return !list_empty(&slow_work_queue) || | ||
705 | (!list_empty(&vslow_work_queue) && | ||
706 | atomic_read(&vslow_work_executing_count) < vsmax); | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * Worker thread dispatcher | ||
711 | */ | ||
712 | static int slow_work_thread(void *_data) | ||
713 | { | ||
714 | int vsmax, id; | ||
715 | |||
716 | DEFINE_WAIT(wait); | ||
717 | |||
718 | set_freezable(); | ||
719 | set_user_nice(current, -5); | ||
720 | |||
721 | /* allocate ourselves an ID */ | ||
722 | spin_lock_irq(&slow_work_queue_lock); | ||
723 | id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); | ||
724 | BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); | ||
725 | __set_bit(id, slow_work_ids); | ||
726 | slow_work_set_thread_pid(id, current->pid); | ||
727 | spin_unlock_irq(&slow_work_queue_lock); | ||
728 | |||
729 | sprintf(current->comm, "kslowd%03u", id); | ||
730 | |||
731 | for (;;) { | ||
732 | vsmax = vslow_work_proportion; | ||
733 | vsmax *= atomic_read(&slow_work_thread_count); | ||
734 | vsmax /= 100; | ||
735 | |||
736 | prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, | ||
737 | TASK_INTERRUPTIBLE); | ||
738 | if (!freezing(current) && | ||
739 | !slow_work_threads_should_exit && | ||
740 | !slow_work_available(vsmax) && | ||
741 | !slow_work_cull) | ||
742 | schedule(); | ||
743 | finish_wait(&slow_work_thread_wq, &wait); | ||
744 | |||
745 | try_to_freeze(); | ||
746 | |||
747 | vsmax = vslow_work_proportion; | ||
748 | vsmax *= atomic_read(&slow_work_thread_count); | ||
749 | vsmax /= 100; | ||
750 | |||
751 | if (slow_work_available(vsmax) && slow_work_execute(id)) { | ||
752 | cond_resched(); | ||
753 | if (list_empty(&slow_work_queue) && | ||
754 | list_empty(&vslow_work_queue) && | ||
755 | atomic_read(&slow_work_thread_count) > | ||
756 | slow_work_min_threads) | ||
757 | slow_work_schedule_cull(); | ||
758 | continue; | ||
759 | } | ||
760 | |||
761 | if (slow_work_threads_should_exit) | ||
762 | break; | ||
763 | |||
764 | if (slow_work_cull && slow_work_cull_thread()) | ||
765 | break; | ||
766 | } | ||
767 | |||
768 | spin_lock_irq(&slow_work_queue_lock); | ||
769 | slow_work_set_thread_pid(id, 0); | ||
770 | __clear_bit(id, slow_work_ids); | ||
771 | spin_unlock_irq(&slow_work_queue_lock); | ||
772 | |||
773 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
774 | complete_and_exit(&slow_work_last_thread_exited, 0); | ||
775 | return 0; | ||
776 | } | ||
777 | |||
778 | /* | ||
779 | * Handle thread cull timer expiration | ||
780 | */ | ||
781 | static void slow_work_cull_timeout(unsigned long data) | ||
782 | { | ||
783 | slow_work_cull = true; | ||
784 | wake_up(&slow_work_thread_wq); | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * Start a new slow work thread | ||
789 | */ | ||
790 | static void slow_work_new_thread_execute(struct slow_work *work) | ||
791 | { | ||
792 | struct task_struct *p; | ||
793 | |||
794 | if (slow_work_threads_should_exit) | ||
795 | return; | ||
796 | |||
797 | if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) | ||
798 | return; | ||
799 | |||
800 | if (!mutex_trylock(&slow_work_user_lock)) | ||
801 | return; | ||
802 | |||
803 | slow_work_may_not_start_new_thread = true; | ||
804 | atomic_inc(&slow_work_thread_count); | ||
805 | p = kthread_run(slow_work_thread, NULL, "kslowd"); | ||
806 | if (IS_ERR(p)) { | ||
807 | printk(KERN_DEBUG "Slow work thread pool: OOM\n"); | ||
808 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
809 | BUG(); /* we're running on a slow work thread... */ | ||
810 | mod_timer(&slow_work_oom_timer, | ||
811 | round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); | ||
812 | } else { | ||
813 | /* ratelimit the starting of new threads */ | ||
814 | mod_timer(&slow_work_oom_timer, jiffies + 1); | ||
815 | } | ||
816 | |||
817 | mutex_unlock(&slow_work_user_lock); | ||
818 | } | ||
819 | |||
/*
 * Operations table whose execute hook is slow_work_new_thread_execute();
 * slow_work_register_user() passes it to slow_work_init() for the
 * slow_work_new_thread item.
 */
820 | static const struct slow_work_ops slow_work_new_thread_ops = { | ||
821 | .owner = THIS_MODULE, | ||
822 | .execute = slow_work_new_thread_execute, | ||
823 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
/* the description hook is only set in CONFIG_SLOW_WORK_DEBUG builds */
824 | .desc = slow_work_new_thread_desc, | ||
825 | #endif | ||
826 | }; | ||
827 | |||
828 | /* | ||
829 | * post-OOM new thread start suppression expiration | ||
 *
 * Clears slow_work_may_not_start_new_thread, the flag that
 * slow_work_new_thread_execute() sets before starting a thread.
 * The @data timer argument is not used.
 * NOTE(review): presumably the handler of slow_work_oom_timer - the timer
 * definition is not visible here, confirm.
830 | */ | ||
831 | static void slow_work_oom_timeout(unsigned long data) | ||
832 | { | ||
833 | slow_work_may_not_start_new_thread = false; | ||
834 | } | ||
835 | |||
836 | #ifdef CONFIG_SYSCTL | ||
837 | /* | ||
838 | * Handle adjustment of the minimum number of threads | ||
839 | */ | ||
840 | static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, | ||
841 | void __user *buffer, | ||
842 | size_t *lenp, loff_t *ppos) | ||
843 | { | ||
844 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
845 | int n; | ||
846 | |||
847 | if (ret == 0) { | ||
848 | mutex_lock(&slow_work_user_lock); | ||
849 | if (slow_work_user_count > 0) { | ||
850 | /* see if we need to start or stop threads */ | ||
851 | n = atomic_read(&slow_work_thread_count) - | ||
852 | slow_work_min_threads; | ||
853 | |||
854 | if (n < 0 && !slow_work_may_not_start_new_thread) | ||
855 | slow_work_enqueue(&slow_work_new_thread); | ||
856 | else if (n > 0) | ||
857 | slow_work_schedule_cull(); | ||
858 | } | ||
859 | mutex_unlock(&slow_work_user_lock); | ||
860 | } | ||
861 | |||
862 | return ret; | ||
863 | } | ||
864 | |||
865 | /* | ||
866 | * Handle adjustment of the maximum number of threads | ||
867 | */ | ||
868 | static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, | ||
869 | void __user *buffer, | ||
870 | size_t *lenp, loff_t *ppos) | ||
871 | { | ||
872 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
873 | int n; | ||
874 | |||
875 | if (ret == 0) { | ||
876 | mutex_lock(&slow_work_user_lock); | ||
877 | if (slow_work_user_count > 0) { | ||
878 | /* see if we need to stop threads */ | ||
879 | n = slow_work_max_threads - | ||
880 | atomic_read(&slow_work_thread_count); | ||
881 | |||
882 | if (n < 0) | ||
883 | slow_work_schedule_cull(); | ||
884 | } | ||
885 | mutex_unlock(&slow_work_user_lock); | ||
886 | } | ||
887 | |||
888 | return ret; | ||
889 | } | ||
890 | #endif /* CONFIG_SYSCTL */ | ||
891 | |||
892 | /** | ||
893 | * slow_work_register_user - Register a user of the facility | ||
894 | * @module: The module about to make use of the facility | ||
895 | * | ||
896 | * Register a user of the facility, starting up the initial threads if there | ||
897 | * aren't any other users at this point. This will return 0 if successful, or | ||
898 | * an error if not. | ||
899 | */ | ||
900 | int slow_work_register_user(struct module *module) | ||
901 | { | ||
902 | struct task_struct *p; | ||
903 | int loop; | ||
904 | |||
905 | mutex_lock(&slow_work_user_lock); | ||
906 | |||
907 | if (slow_work_user_count == 0) { | ||
908 | printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); | ||
909 | init_completion(&slow_work_last_thread_exited); | ||
910 | |||
911 | slow_work_threads_should_exit = false; | ||
912 | slow_work_init(&slow_work_new_thread, | ||
913 | &slow_work_new_thread_ops); | ||
914 | slow_work_may_not_start_new_thread = false; | ||
915 | slow_work_cull = false; | ||
916 | |||
917 | /* start the minimum number of threads */ | ||
918 | for (loop = 0; loop < slow_work_min_threads; loop++) { | ||
919 | atomic_inc(&slow_work_thread_count); | ||
920 | p = kthread_run(slow_work_thread, NULL, "kslowd"); | ||
921 | if (IS_ERR(p)) | ||
922 | goto error; | ||
923 | } | ||
924 | printk(KERN_NOTICE "Slow work thread pool: Ready\n"); | ||
925 | } | ||
926 | |||
927 | slow_work_user_count++; | ||
928 | mutex_unlock(&slow_work_user_lock); | ||
929 | return 0; | ||
930 | |||
931 | error: | ||
932 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
933 | complete(&slow_work_last_thread_exited); | ||
934 | if (loop > 0) { | ||
935 | printk(KERN_ERR "Slow work thread pool:" | ||
936 | " Aborting startup on ENOMEM\n"); | ||
937 | slow_work_threads_should_exit = true; | ||
938 | wake_up_all(&slow_work_thread_wq); | ||
939 | wait_for_completion(&slow_work_last_thread_exited); | ||
940 | printk(KERN_ERR "Slow work thread pool: Aborted\n"); | ||
941 | } | ||
942 | mutex_unlock(&slow_work_user_lock); | ||
943 | return PTR_ERR(p); | ||
944 | } | ||
945 | EXPORT_SYMBOL(slow_work_register_user); | ||
946 | |||
947 | /* | ||
948 | * wait for all outstanding items from the calling module to complete | ||
949 | * - note that more items may be queued whilst we're waiting | ||
 * - the whole body is compiled only under CONFIG_MODULES; without it this
 *   function does nothing
 * - callers are serialised by slow_work_unreg_sync_lock, and the queues and
 *   the per-thread processing table are examined under slow_work_queue_lock
950 | */ | ||
951 | static void slow_work_wait_for_items(struct module *module) | ||
952 | { | ||
953 | #ifdef CONFIG_MODULES | ||
954 | DECLARE_WAITQUEUE(myself, current); | ||
955 | struct slow_work *work; | ||
956 | int loop; | ||
957 | |||
958 | mutex_lock(&slow_work_unreg_sync_lock); | ||
959 | add_wait_queue(&slow_work_unreg_wq, &myself); | ||
960 | |||
/* rescan from scratch after every wakeup until nothing of ours is left */
961 | for (;;) { | ||
962 | spin_lock_irq(&slow_work_queue_lock); | ||
963 | |||
964 | /* first of all, we wait for the last queued item in each list | ||
965 | * to be processed */ | ||
966 | list_for_each_entry_reverse(work, &vslow_work_queue, link) { | ||
967 | if (work->owner == module) { | ||
968 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
969 | slow_work_unreg_work_item = work; | ||
970 | goto do_wait; | ||
971 | } | ||
972 | } | ||
973 | list_for_each_entry_reverse(work, &slow_work_queue, link) { | ||
974 | if (work->owner == module) { | ||
975 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
976 | slow_work_unreg_work_item = work; | ||
977 | goto do_wait; | ||
978 | } | ||
979 | } | ||
980 | |||
981 | /* then we wait for the items being processed to finish */ | ||
982 | slow_work_unreg_module = module; | ||
983 | smp_mb(); | ||
/* NOTE(review): unlike the two queue scans above, this path reaches do_wait
 * without calling set_current_state() first - confirm that is intended */
984 | for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { | ||
985 | if (slow_work_thread_processing[loop] == module) | ||
986 | goto do_wait; | ||
987 | } | ||
988 | spin_unlock_irq(&slow_work_queue_lock); | ||
989 | break; /* okay, we're done */ | ||
990 | |||
991 | do_wait: | ||
992 | spin_unlock_irq(&slow_work_queue_lock); | ||
993 | schedule(); | ||
/* clear both hand-off variables before the next scan */
994 | slow_work_unreg_work_item = NULL; | ||
995 | slow_work_unreg_module = NULL; | ||
996 | } | ||
997 | |||
998 | remove_wait_queue(&slow_work_unreg_wq, &myself); | ||
999 | mutex_unlock(&slow_work_unreg_sync_lock); | ||
1000 | #endif /* CONFIG_MODULES */ | ||
1001 | } | ||
1002 | |||
1003 | /** | ||
1004 | * slow_work_unregister_user - Unregister a user of the facility | ||
1005 | * @module: The module whose items should be cleared | ||
1006 | * | ||
1007 | * Unregister a user of the facility, killing all the threads if this was the | ||
1008 | * last one. | ||
1009 | * | ||
1010 | * This waits for all the work items belonging to the nominated module to go | ||
1011 | * away before proceeding. | ||
1012 | */ | ||
1013 | void slow_work_unregister_user(struct module *module) | ||
1014 | { | ||
1015 | /* first of all, wait for all outstanding items from the calling module | ||
1016 | * to complete */ | ||
1017 | if (module) | ||
1018 | slow_work_wait_for_items(module); | ||
1019 | |||
1020 | /* then we can actually go about shutting down the facility if need | ||
1021 | * be */ | ||
1022 | mutex_lock(&slow_work_user_lock); | ||
1023 | |||
1024 | BUG_ON(slow_work_user_count <= 0); | ||
1025 | |||
1026 | slow_work_user_count--; | ||
1027 | if (slow_work_user_count == 0) { | ||
1028 | printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); | ||
1029 | slow_work_threads_should_exit = true; | ||
1030 | del_timer_sync(&slow_work_cull_timer); | ||
1031 | del_timer_sync(&slow_work_oom_timer); | ||
1032 | wake_up_all(&slow_work_thread_wq); | ||
1033 | wait_for_completion(&slow_work_last_thread_exited); | ||
1034 | printk(KERN_NOTICE "Slow work thread pool:" | ||
1035 | " Shut down complete\n"); | ||
1036 | } | ||
1037 | |||
1038 | mutex_unlock(&slow_work_user_lock); | ||
1039 | } | ||
1040 | EXPORT_SYMBOL(slow_work_unregister_user); | ||
1041 | |||
1042 | /* | ||
1043 | * Initialise the slow work facility | ||
1044 | */ | ||
1045 | static int __init init_slow_work(void) | ||
1046 | { | ||
1047 | unsigned nr_cpus = num_possible_cpus(); | ||
1048 | |||
1049 | if (slow_work_max_threads < nr_cpus) | ||
1050 | slow_work_max_threads = nr_cpus; | ||
1051 | #ifdef CONFIG_SYSCTL | ||
1052 | if (slow_work_max_max_threads < nr_cpus * 2) | ||
1053 | slow_work_max_max_threads = nr_cpus * 2; | ||
1054 | #endif | ||
1055 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
1056 | { | ||
1057 | struct dentry *dbdir; | ||
1058 | |||
1059 | dbdir = debugfs_create_dir("slow_work", NULL); | ||
1060 | if (dbdir && !IS_ERR(dbdir)) | ||
1061 | debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, | ||
1062 | NULL, &slow_work_runqueue_fops); | ||
1063 | } | ||
1064 | #endif | ||
1065 | return 0; | ||
1066 | } | ||
1067 | |||
1068 | subsys_initcall(init_slow_work); | ||
diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41d..000000000000 --- a/kernel/slow-work.h +++ /dev/null | |||
@@ -1,72 +0,0 @@ | |||
1 | /* Slow work private definitions | ||
2 | * | ||
3 | * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public Licence | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the Licence, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of | ||
13 | * things to do */ | ||
14 | #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after | ||
15 | * OOM */ | ||
16 | |||
17 | #define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ | ||
18 | |||
19 | /* | ||
20 | * slow-work.c | ||
21 | */ | ||
22 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
23 | extern struct slow_work *slow_work_execs[]; | ||
24 | extern pid_t slow_work_pids[]; | ||
25 | extern rwlock_t slow_work_execs_lock; | ||
26 | #endif | ||
27 | |||
28 | extern struct list_head slow_work_queue; | ||
29 | extern struct list_head vslow_work_queue; | ||
30 | extern spinlock_t slow_work_queue_lock; | ||
31 | |||
32 | /* | ||
33 | * slow-work-debugfs.c | ||
34 | */ | ||
35 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
36 | extern const struct file_operations slow_work_runqueue_fops; | ||
37 | |||
38 | extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); | ||
39 | #endif | ||
40 | |||
41 | /* | ||
42 | * Helper functions | ||
43 | */ | ||
/*
 * Record @pid in slot @id of slow_work_pids[].  Compiles to nothing unless
 * CONFIG_SLOW_WORK_DEBUG is set.
 */
44 | static inline void slow_work_set_thread_pid(int id, pid_t pid) | ||
45 | { | ||
46 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
47 | slow_work_pids[id] = pid; | ||
48 | #endif | ||
49 | } | ||
50 | |||
/*
 * Stamp @work->mark with CURRENT_TIME.  Compiles to nothing unless
 * CONFIG_SLOW_WORK_DEBUG is set.
 */
51 | static inline void slow_work_mark_time(struct slow_work *work) | ||
52 | { | ||
53 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
54 | work->mark = CURRENT_TIME; | ||
55 | #endif | ||
56 | } | ||
57 | |||
/*
 * Record @work in slot @id of slow_work_execs[].  Compiles to nothing unless
 * CONFIG_SLOW_WORK_DEBUG is set.  No lock is taken here, unlike in
 * slow_work_end_exec().
 */
58 | static inline void slow_work_begin_exec(int id, struct slow_work *work) | ||
59 | { | ||
60 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
61 | slow_work_execs[id] = work; | ||
62 | #endif | ||
63 | } | ||
64 | |||
/*
 * Clear slot @id of slow_work_execs[] while holding slow_work_execs_lock
 * for writing.  @work is not used.  Compiles to nothing unless
 * CONFIG_SLOW_WORK_DEBUG is set.
 */
65 | static inline void slow_work_end_exec(int id, struct slow_work *work) | ||
66 | { | ||
67 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
68 | write_lock(&slow_work_execs_lock); | ||
69 | slow_work_execs[id] = NULL; | ||
70 | write_unlock(&slow_work_execs_lock); | ||
71 | #endif | ||
72 | } | ||
diff --git a/kernel/smp.c b/kernel/smp.c index 75c970c715d3..ed6aacfcb7ef 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -365,9 +365,10 @@ call: | |||
365 | EXPORT_SYMBOL_GPL(smp_call_function_any); | 365 | EXPORT_SYMBOL_GPL(smp_call_function_any); |
366 | 366 | ||
367 | /** | 367 | /** |
368 | * __smp_call_function_single(): Run a function on another CPU | 368 | * __smp_call_function_single(): Run a function on a specific CPU |
369 | * @cpu: The CPU to run on. | 369 | * @cpu: The CPU to run on. |
370 | * @data: Pre-allocated and setup data structure | 370 | * @data: Pre-allocated and setup data structure |
371 | * @wait: If true, wait until function has completed on specified CPU. | ||
371 | * | 372 | * |
372 | * Like smp_call_function_single(), but allow caller to pass in a | 373 | * Like smp_call_function_single(), but allow caller to pass in a |
373 | * pre-allocated data structure. Useful for embedding @data inside | 374 | * pre-allocated data structure. Useful for embedding @data inside |
@@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); | |||
376 | void __smp_call_function_single(int cpu, struct call_single_data *data, | 377 | void __smp_call_function_single(int cpu, struct call_single_data *data, |
377 | int wait) | 378 | int wait) |
378 | { | 379 | { |
379 | csd_lock(data); | 380 | unsigned int this_cpu; |
381 | unsigned long flags; | ||
380 | 382 | ||
383 | this_cpu = get_cpu(); | ||
381 | /* | 384 | /* |
382 | * Can deadlock when called with interrupts disabled. | 385 | * Can deadlock when called with interrupts disabled. |
383 | * We allow cpu's that are not yet online though, as no one else can | 386 | * We allow cpu's that are not yet online though, as no one else can |
@@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, | |||
387 | WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() | 390 | WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() |
388 | && !oops_in_progress); | 391 | && !oops_in_progress); |
389 | 392 | ||
390 | generic_exec_single(cpu, data, wait); | 393 | if (cpu == this_cpu) { |
394 | local_irq_save(flags); | ||
395 | data->func(data->info); | ||
396 | local_irq_restore(flags); | ||
397 | } else { | ||
398 | csd_lock(data); | ||
399 | generic_exec_single(cpu, data, wait); | ||
400 | } | ||
401 | put_cpu(); | ||
391 | } | 402 | } |
392 | 403 | ||
393 | /** | 404 | /** |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f67dcb5..000000000000 --- a/kernel/softlockup.c +++ /dev/null | |||
@@ -1,293 +0,0 @@ | |||
1 | /* | ||
2 | * Detect Soft Lockups | ||
3 | * | ||
4 | * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. | ||
5 | * | ||
6 | * this code detects soft lockups: incidents where, on a CPU, | ||
7 | * the kernel does not reschedule for 10 seconds or more. | ||
8 | */ | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/cpu.h> | ||
11 | #include <linux/nmi.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <linux/freezer.h> | ||
15 | #include <linux/kthread.h> | ||
16 | #include <linux/lockdep.h> | ||
17 | #include <linux/notifier.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/sysctl.h> | ||
20 | |||
21 | #include <asm/irq_regs.h> | ||
22 | |||
/* held by softlockup_tick() around its report output */
23 | static DEFINE_SPINLOCK(print_lock); | ||
24 | |||
25 | static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ | ||
26 | static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ | ||
/* per-CPU watchdog thread; NULL makes softlockup_tick() treat detection as off */
27 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
/* set by touch_softlockup_watchdog_sync(), consumed by softlockup_tick() */
28 | static DEFINE_PER_CPU(bool, softlock_touch_sync); | ||
29 | |||
/* set from the panic notifier; stops softlockup_tick() from reporting */
30 | static int __read_mostly did_panic; | ||
/* threshold compared against get_timestamp() deltas in softlockup_tick() */
31 | int __read_mostly softlockup_thresh = 60; | ||
32 | |||
33 | /* | ||
34 | * Should we panic (and reboot, if panic_timeout= is set) when a | ||
35 | * soft-lockup occurs: | ||
36 | */ | ||
37 | unsigned int __read_mostly softlockup_panic = | ||
38 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
39 | |||
/*
 * Parse the "softlockup_panic=" boot parameter: the value is converted with
 * simple_strtoul() (base auto-detected) and stored in softlockup_panic.
 * Always returns 1.
 */
40 | static int __init softlockup_panic_setup(char *str) | ||
41 | { | ||
42 | softlockup_panic = simple_strtoul(str, NULL, 0); | ||
43 | |||
44 | return 1; | ||
45 | } | ||
46 | __setup("softlockup_panic=", softlockup_panic_setup); | ||
47 | |||
/*
 * Notifier callback (installed through panic_block): records that a panic
 * happened by setting did_panic.  None of the arguments are used; always
 * returns NOTIFY_DONE.
 */
48 | static int | ||
49 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
50 | { | ||
51 | did_panic = 1; | ||
52 | |||
53 | return NOTIFY_DONE; | ||
54 | } | ||
55 | |||
/* registered on panic_notifier_list by spawn_softlockup_task() */
56 | static struct notifier_block panic_block = { | ||
57 | .notifier_call = softlock_panic, | ||
58 | }; | ||
59 | |||
60 | /* | ||
61 | * Returns seconds, approximately. We don't need nanosecond | ||
62 | * resolution, and we don't need to waste time with a big divide when | ||
63 | * 2^30ns == 1.074s. | ||
 *
 * The value is cpu_clock(@this_cpu) shifted right by 30 bits.
64 | */ | ||
65 | static unsigned long get_timestamp(int this_cpu) | ||
66 | { | ||
67 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | ||
68 | } | ||
69 | |||
/*
 * Store the current get_timestamp() value in this CPU's softlockup_touch_ts.
 * Uses the raw per-CPU accessors (raw_smp_processor_id() and
 * __raw_get_cpu_var()).
 */
70 | static void __touch_softlockup_watchdog(void) | ||
71 | { | ||
72 | int this_cpu = raw_smp_processor_id(); | ||
73 | |||
74 | __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); | ||
75 | } | ||
76 | |||
/*
 * Zero this CPU's softlockup_touch_ts.  softlockup_tick() treats a zero
 * timestamp as "refresh the timestamp and return" instead of checking for a
 * lockup.
 */
77 | void touch_softlockup_watchdog(void) | ||
78 | { | ||
79 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
80 | } | ||
81 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
82 | |||
/*
 * Like touch_softlockup_watchdog(), but also sets this CPU's
 * softlock_touch_sync flag, which makes softlockup_tick() call
 * sched_clock_tick() before it refreshes the timestamp.
 */
83 | void touch_softlockup_watchdog_sync(void) | ||
84 | { | ||
85 | __raw_get_cpu_var(softlock_touch_sync) = true; | ||
86 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
87 | } | ||
88 | |||
/*
 * Zero softlockup_touch_ts on every online CPU.
 */
89 | void touch_all_softlockup_watchdogs(void) | ||
90 | { | ||
91 | int cpu; | ||
92 | |||
93 | /* Cause each CPU to re-update its timestamp rather than complain */ | ||
94 | for_each_online_cpu(cpu) | ||
95 | per_cpu(softlockup_touch_ts, cpu) = 0; | ||
96 | } | ||
97 | EXPORT_SYMBOL(touch_all_softlockup_watchdogs); | ||
98 | |||
/*
 * sysctl handler: touches the watchdog timestamps of all online CPUs and
 * then hands the access to proc_dointvec_minmax(), whose result is
 * returned.  The touch happens on every call, whether or not @write is set.
 */
99 | int proc_dosoftlockup_thresh(struct ctl_table *table, int write, | ||
100 | void __user *buffer, | ||
101 | size_t *lenp, loff_t *ppos) | ||
102 | { | ||
103 | touch_all_softlockup_watchdogs(); | ||
104 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * This callback runs from the timer interrupt, and checks | ||
109 | * whether the watchdog thread has hung or not: | ||
110 | */ | ||
111 | void softlockup_tick(void) | ||
112 | { | ||
113 | int this_cpu = smp_processor_id(); | ||
114 | unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); | ||
115 | unsigned long print_ts; | ||
116 | struct pt_regs *regs = get_irq_regs(); | ||
117 | unsigned long now; | ||
118 | |||
119 | /* Is detection switched off? */ | ||
120 | if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { | ||
121 | /* Be sure we don't false trigger if switched back on */ | ||
122 | if (touch_ts) | ||
123 | per_cpu(softlockup_touch_ts, this_cpu) = 0; | ||
124 | return; | ||
125 | } | ||
126 | |||
127 | if (touch_ts == 0) { | ||
128 | if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { | ||
129 | /* | ||
130 | * If the time stamp was touched atomically | ||
131 | * make sure the scheduler tick is up to date. | ||
132 | */ | ||
133 | per_cpu(softlock_touch_sync, this_cpu) = false; | ||
134 | sched_clock_tick(); | ||
135 | } | ||
136 | __touch_softlockup_watchdog(); | ||
137 | return; | ||
138 | } | ||
139 | |||
140 | print_ts = per_cpu(softlockup_print_ts, this_cpu); | ||
141 | |||
142 | /* report at most once a second */ | ||
143 | if (print_ts == touch_ts || did_panic) | ||
144 | return; | ||
145 | |||
146 | /* do not print during early bootup: */ | ||
147 | if (unlikely(system_state != SYSTEM_RUNNING)) { | ||
148 | __touch_softlockup_watchdog(); | ||
149 | return; | ||
150 | } | ||
151 | |||
152 | now = get_timestamp(this_cpu); | ||
153 | |||
154 | /* | ||
155 | * Wake up the high-prio watchdog task twice per | ||
156 | * threshold timespan. | ||
157 | */ | ||
158 | if (time_after(now - softlockup_thresh/2, touch_ts)) | ||
159 | wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); | ||
160 | |||
161 | /* Warn about unreasonable delays: */ | ||
162 | if (time_before_eq(now - softlockup_thresh, touch_ts)) | ||
163 | return; | ||
164 | |||
165 | per_cpu(softlockup_print_ts, this_cpu) = touch_ts; | ||
166 | |||
167 | spin_lock(&print_lock); | ||
168 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", | ||
169 | this_cpu, now - touch_ts, | ||
170 | current->comm, task_pid_nr(current)); | ||
171 | print_modules(); | ||
172 | print_irqtrace_events(current); | ||
173 | if (regs) | ||
174 | show_regs(regs); | ||
175 | else | ||
176 | dump_stack(); | ||
177 | spin_unlock(&print_lock); | ||
178 | |||
179 | if (softlockup_panic) | ||
180 | panic("softlockup: hung tasks"); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * The watchdog thread - runs every second and touches the timestamp. | ||
185 | */ | ||
186 | static int watchdog(void *__bind_cpu) | ||
187 | { | ||
188 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
189 | |||
190 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
191 | |||
192 | /* initialize timestamp */ | ||
193 | __touch_softlockup_watchdog(); | ||
194 | |||
195 | set_current_state(TASK_INTERRUPTIBLE); | ||
196 | /* | ||
197 | * Run briefly once per second to reset the softlockup timestamp. | ||
198 | * If this gets delayed for more than 60 seconds then the | ||
199 | * debug-printout triggers in softlockup_tick(). | ||
200 | */ | ||
201 | while (!kthread_should_stop()) { | ||
202 | __touch_softlockup_watchdog(); | ||
203 | schedule(); | ||
204 | |||
205 | if (kthread_should_stop()) | ||
206 | break; | ||
207 | |||
208 | set_current_state(TASK_INTERRUPTIBLE); | ||
209 | } | ||
210 | __set_current_state(TASK_RUNNING); | ||
211 | |||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Create/destroy watchdog threads as CPUs come and go: | ||
 *
 * - CPU_UP_PREPARE: create the "watchdog/N" thread, zero the CPU's touch
 *   timestamp, record the thread and bind it to the CPU
 * - CPU_ONLINE: wake the thread
 * - CPU_UP_CANCELED (CONFIG_HOTPLUG_CPU only): if a thread exists, rebind it
 *   to any online CPU and fall through to the CPU_DEAD handling
 * - CPU_DEAD (CONFIG_HOTPLUG_CPU only): clear the per-CPU pointer and stop
 *   the thread
 *
 * Returns NOTIFY_BAD if the thread cannot be created, NOTIFY_OK otherwise.
217 | */ | ||
218 | static int __cpuinit | ||
219 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
220 | { | ||
221 | int hotcpu = (unsigned long)hcpu; | ||
222 | struct task_struct *p; | ||
223 | |||
224 | switch (action) { | ||
225 | case CPU_UP_PREPARE: | ||
226 | case CPU_UP_PREPARE_FROZEN: | ||
227 | BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); | ||
228 | p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); | ||
229 | if (IS_ERR(p)) { | ||
230 | printk(KERN_ERR "watchdog for %i failed\n", hotcpu); | ||
231 | return NOTIFY_BAD; | ||
232 | } | ||
233 | per_cpu(softlockup_touch_ts, hotcpu) = 0; | ||
234 | per_cpu(softlockup_watchdog, hotcpu) = p; | ||
235 | kthread_bind(p, hotcpu); | ||
236 | break; | ||
237 | case CPU_ONLINE: | ||
238 | case CPU_ONLINE_FROZEN: | ||
239 | wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); | ||
240 | break; | ||
241 | #ifdef CONFIG_HOTPLUG_CPU | ||
242 | case CPU_UP_CANCELED: | ||
243 | case CPU_UP_CANCELED_FROZEN: | ||
244 | if (!per_cpu(softlockup_watchdog, hotcpu)) | ||
245 | break; | ||
246 | /* Unbind so it can run. Fall thru. */ | ||
247 | kthread_bind(per_cpu(softlockup_watchdog, hotcpu), | ||
248 | cpumask_any(cpu_online_mask)); | ||
249 | case CPU_DEAD: | ||
250 | case CPU_DEAD_FROZEN: | ||
/* clear the per-CPU pointer before stopping the thread */
251 | p = per_cpu(softlockup_watchdog, hotcpu); | ||
252 | per_cpu(softlockup_watchdog, hotcpu) = NULL; | ||
253 | kthread_stop(p); | ||
254 | break; | ||
255 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
256 | } | ||
257 | return NOTIFY_OK; | ||
258 | } | ||
258 | } | ||
259 | |||
/* CPU notifier wrapping cpu_callback(); registered by spawn_softlockup_task() */
260 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
261 | .notifier_call = cpu_callback | ||
262 | }; | ||
263 | |||
/* non-zero once "nosoftlockup" was seen on the command line */
264 | static int __initdata nosoftlockup; | ||
265 | |||
/*
 * Handler for the "nosoftlockup" boot parameter: sets the nosoftlockup flag,
 * which makes spawn_softlockup_task() return without setting anything up.
 * @str is not used.  Always returns 1.
 */
266 | static int __init nosoftlockup_setup(char *str) | ||
267 | { | ||
268 | nosoftlockup = 1; | ||
269 | return 1; | ||
270 | } | ||
271 | __setup("nosoftlockup", nosoftlockup_setup); | ||
272 | |||
273 | static int __init spawn_softlockup_task(void) | ||
274 | { | ||
275 | void *cpu = (void *)(long)smp_processor_id(); | ||
276 | int err; | ||
277 | |||
278 | if (nosoftlockup) | ||
279 | return 0; | ||
280 | |||
281 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
282 | if (err == NOTIFY_BAD) { | ||
283 | BUG(); | ||
284 | return 1; | ||
285 | } | ||
286 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
287 | register_cpu_notifier(&cpu_nfb); | ||
288 | |||
289 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
290 | |||
291 | return 0; | ||
292 | } | ||
293 | early_initcall(spawn_softlockup_task); | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 70f8d90331e9..4372ccb25127 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -35,9 +35,9 @@ struct cpu_stop_done { | |||
35 | /* the actual stopper, one per every possible cpu, enabled on online cpus */ | 35 | /* the actual stopper, one per every possible cpu, enabled on online cpus */ |
36 | struct cpu_stopper { | 36 | struct cpu_stopper { |
37 | spinlock_t lock; | 37 | spinlock_t lock; |
38 | bool enabled; /* is this stopper enabled? */ | ||
38 | struct list_head works; /* list of pending works */ | 39 | struct list_head works; /* list of pending works */ |
39 | struct task_struct *thread; /* stopper thread */ | 40 | struct task_struct *thread; /* stopper thread */ |
40 | bool enabled; /* is this stopper enabled? */ | ||
41 | }; | 41 | }; |
42 | 42 | ||
43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
diff --git a/kernel/sys.c b/kernel/sys.c index e83ddbbaf89d..7f5a0cd296a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
931 | pgid = pid; | 931 | pgid = pid; |
932 | if (pgid < 0) | 932 | if (pgid < 0) |
933 | return -EINVAL; | 933 | return -EINVAL; |
934 | rcu_read_lock(); | ||
934 | 935 | ||
935 | /* From this point forward we keep holding onto the tasklist lock | 936 | /* From this point forward we keep holding onto the tasklist lock |
936 | * so that our parent does not change from under us. -DaveM | 937 | * so that our parent does not change from under us. -DaveM |
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) | |||
984 | out: | 985 | out: |
985 | /* All paths lead to here, thus we are safe. -DaveM */ | 986 | /* All paths lead to here, thus we are safe. -DaveM */ |
986 | write_unlock_irq(&tasklist_lock); | 987 | write_unlock_irq(&tasklist_lock); |
988 | rcu_read_unlock(); | ||
987 | return err; | 989 | return err; |
988 | } | 990 | } |
989 | 991 | ||
@@ -1236,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1236 | 1238 | ||
1237 | SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | 1239 | SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) |
1238 | { | 1240 | { |
1239 | if (resource >= RLIM_NLIMITS) | 1241 | struct rlimit value; |
1240 | return -EINVAL; | 1242 | int ret; |
1241 | else { | 1243 | |
1242 | struct rlimit value; | 1244 | ret = do_prlimit(current, resource, NULL, &value); |
1243 | task_lock(current->group_leader); | 1245 | if (!ret) |
1244 | value = current->signal->rlim[resource]; | 1246 | ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; |
1245 | task_unlock(current->group_leader); | 1247 | |
1246 | return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; | 1248 | return ret; |
1247 | } | ||
1248 | } | 1249 | } |
1249 | 1250 | ||
1250 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT | 1251 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT |
@@ -1272,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
1272 | 1273 | ||
1273 | #endif | 1274 | #endif |
1274 | 1275 | ||
1275 | SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) | 1276 | static inline bool rlim64_is_infinity(__u64 rlim64) |
1276 | { | 1277 | { |
1277 | struct rlimit new_rlim, *old_rlim; | 1278 | #if BITS_PER_LONG < 64 |
1278 | int retval; | 1279 | return rlim64 >= ULONG_MAX; |
1280 | #else | ||
1281 | return rlim64 == RLIM64_INFINITY; | ||
1282 | #endif | ||
1283 | } | ||
1284 | |||
1285 | static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) | ||
1286 | { | ||
1287 | if (rlim->rlim_cur == RLIM_INFINITY) | ||
1288 | rlim64->rlim_cur = RLIM64_INFINITY; | ||
1289 | else | ||
1290 | rlim64->rlim_cur = rlim->rlim_cur; | ||
1291 | if (rlim->rlim_max == RLIM_INFINITY) | ||
1292 | rlim64->rlim_max = RLIM64_INFINITY; | ||
1293 | else | ||
1294 | rlim64->rlim_max = rlim->rlim_max; | ||
1295 | } | ||
1296 | |||
1297 | static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) | ||
1298 | { | ||
1299 | if (rlim64_is_infinity(rlim64->rlim_cur)) | ||
1300 | rlim->rlim_cur = RLIM_INFINITY; | ||
1301 | else | ||
1302 | rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; | ||
1303 | if (rlim64_is_infinity(rlim64->rlim_max)) | ||
1304 | rlim->rlim_max = RLIM_INFINITY; | ||
1305 | else | ||
1306 | rlim->rlim_max = (unsigned long)rlim64->rlim_max; | ||
1307 | } | ||
1308 | |||
1309 | /* make sure you are allowed to change @tsk limits before calling this */ | ||
1310 | int do_prlimit(struct task_struct *tsk, unsigned int resource, | ||
1311 | struct rlimit *new_rlim, struct rlimit *old_rlim) | ||
1312 | { | ||
1313 | struct rlimit *rlim; | ||
1314 | int retval = 0; | ||
1279 | 1315 | ||
1280 | if (resource >= RLIM_NLIMITS) | 1316 | if (resource >= RLIM_NLIMITS) |
1281 | return -EINVAL; | 1317 | return -EINVAL; |
1282 | if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) | 1318 | if (new_rlim) { |
1283 | return -EFAULT; | 1319 | if (new_rlim->rlim_cur > new_rlim->rlim_max) |
1284 | if (new_rlim.rlim_cur > new_rlim.rlim_max) | 1320 | return -EINVAL; |
1285 | return -EINVAL; | 1321 | if (resource == RLIMIT_NOFILE && |
1286 | old_rlim = current->signal->rlim + resource; | 1322 | new_rlim->rlim_max > sysctl_nr_open) |
1287 | if ((new_rlim.rlim_max > old_rlim->rlim_max) && | 1323 | return -EPERM; |
1288 | !capable(CAP_SYS_RESOURCE)) | ||
1289 | return -EPERM; | ||
1290 | if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) | ||
1291 | return -EPERM; | ||
1292 | |||
1293 | retval = security_task_setrlimit(resource, &new_rlim); | ||
1294 | if (retval) | ||
1295 | return retval; | ||
1296 | |||
1297 | if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { | ||
1298 | /* | ||
1299 | * The caller is asking for an immediate RLIMIT_CPU | ||
1300 | * expiry. But we use the zero value to mean "it was | ||
1301 | * never set". So let's cheat and make it one second | ||
1302 | * instead | ||
1303 | */ | ||
1304 | new_rlim.rlim_cur = 1; | ||
1305 | } | 1324 | } |
1306 | 1325 | ||
1307 | task_lock(current->group_leader); | 1326 | /* protect tsk->signal and tsk->sighand from disappearing */ |
1308 | *old_rlim = new_rlim; | 1327 | read_lock(&tasklist_lock); |
1309 | task_unlock(current->group_leader); | 1328 | if (!tsk->sighand) { |
1310 | 1329 | retval = -ESRCH; | |
1311 | if (resource != RLIMIT_CPU) | ||
1312 | goto out; | 1330 | goto out; |
1331 | } | ||
1332 | |||
1333 | rlim = tsk->signal->rlim + resource; | ||
1334 | task_lock(tsk->group_leader); | ||
1335 | if (new_rlim) { | ||
1336 | if (new_rlim->rlim_max > rlim->rlim_max && | ||
1337 | !capable(CAP_SYS_RESOURCE)) | ||
1338 | retval = -EPERM; | ||
1339 | if (!retval) | ||
1340 | retval = security_task_setrlimit(tsk->group_leader, | ||
1341 | resource, new_rlim); | ||
1342 | if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { | ||
1343 | /* | ||
1344 | * The caller is asking for an immediate RLIMIT_CPU | ||
1345 | * expiry. But we use the zero value to mean "it was | ||
1346 | * never set". So let's cheat and make it one second | ||
1347 | * instead | ||
1348 | */ | ||
1349 | new_rlim->rlim_cur = 1; | ||
1350 | } | ||
1351 | } | ||
1352 | if (!retval) { | ||
1353 | if (old_rlim) | ||
1354 | *old_rlim = *rlim; | ||
1355 | if (new_rlim) | ||
1356 | *rlim = *new_rlim; | ||
1357 | } | ||
1358 | task_unlock(tsk->group_leader); | ||
1313 | 1359 | ||
1314 | /* | 1360 | /* |
1315 | * RLIMIT_CPU handling. Note that the kernel fails to return an error | 1361 | * RLIMIT_CPU handling. Note that the kernel fails to return an error |
@@ -1317,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
1317 | * very long-standing error, and fixing it now risks breakage of | 1363 | * very long-standing error, and fixing it now risks breakage of |
1318 | * applications, so we live with it | 1364 | * applications, so we live with it |
1319 | */ | 1365 | */ |
1320 | if (new_rlim.rlim_cur == RLIM_INFINITY) | 1366 | if (!retval && new_rlim && resource == RLIMIT_CPU && |
1321 | goto out; | 1367 | new_rlim->rlim_cur != RLIM_INFINITY) |
1322 | 1368 | update_rlimit_cpu(tsk, new_rlim->rlim_cur); | |
1323 | update_rlimit_cpu(new_rlim.rlim_cur); | ||
1324 | out: | 1369 | out: |
1370 | read_unlock(&tasklist_lock); | ||
1371 | return retval; | ||
1372 | } | ||
1373 | |||
1374 | /* rcu lock must be held */ | ||
1375 | static int check_prlimit_permission(struct task_struct *task) | ||
1376 | { | ||
1377 | const struct cred *cred = current_cred(), *tcred; | ||
1378 | |||
1379 | tcred = __task_cred(task); | ||
1380 | if ((cred->uid != tcred->euid || | ||
1381 | cred->uid != tcred->suid || | ||
1382 | cred->uid != tcred->uid || | ||
1383 | cred->gid != tcred->egid || | ||
1384 | cred->gid != tcred->sgid || | ||
1385 | cred->gid != tcred->gid) && | ||
1386 | !capable(CAP_SYS_RESOURCE)) { | ||
1387 | return -EPERM; | ||
1388 | } | ||
1389 | |||
1325 | return 0; | 1390 | return 0; |
1326 | } | 1391 | } |
1327 | 1392 | ||
1393 | SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, | ||
1394 | const struct rlimit64 __user *, new_rlim, | ||
1395 | struct rlimit64 __user *, old_rlim) | ||
1396 | { | ||
1397 | struct rlimit64 old64, new64; | ||
1398 | struct rlimit old, new; | ||
1399 | struct task_struct *tsk; | ||
1400 | int ret; | ||
1401 | |||
1402 | if (new_rlim) { | ||
1403 | if (copy_from_user(&new64, new_rlim, sizeof(new64))) | ||
1404 | return -EFAULT; | ||
1405 | rlim64_to_rlim(&new64, &new); | ||
1406 | } | ||
1407 | |||
1408 | rcu_read_lock(); | ||
1409 | tsk = pid ? find_task_by_vpid(pid) : current; | ||
1410 | if (!tsk) { | ||
1411 | rcu_read_unlock(); | ||
1412 | return -ESRCH; | ||
1413 | } | ||
1414 | ret = check_prlimit_permission(tsk); | ||
1415 | if (ret) { | ||
1416 | rcu_read_unlock(); | ||
1417 | return ret; | ||
1418 | } | ||
1419 | get_task_struct(tsk); | ||
1420 | rcu_read_unlock(); | ||
1421 | |||
1422 | ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, | ||
1423 | old_rlim ? &old : NULL); | ||
1424 | |||
1425 | if (!ret && old_rlim) { | ||
1426 | rlim_to_rlim64(&old, &old64); | ||
1427 | if (copy_to_user(old_rlim, &old64, sizeof(old64))) | ||
1428 | ret = -EFAULT; | ||
1429 | } | ||
1430 | |||
1431 | put_task_struct(tsk); | ||
1432 | return ret; | ||
1433 | } | ||
1434 | |||
1435 | SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) | ||
1436 | { | ||
1437 | struct rlimit new_rlim; | ||
1438 | |||
1439 | if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) | ||
1440 | return -EFAULT; | ||
1441 | return do_prlimit(current, resource, &new_rlim, NULL); | ||
1442 | } | ||
1443 | |||
1328 | /* | 1444 | /* |
1329 | * It would make sense to put struct rusage in the task_struct, | 1445 | * It would make sense to put struct rusage in the task_struct, |
1330 | * except that would make the task_struct be *really big*. After | 1446 | * except that would make the task_struct be *really big*. After |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 70f2ea758ffe..bad369ec5403 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2); | |||
181 | 181 | ||
182 | /* performance counters: */ | 182 | /* performance counters: */ |
183 | cond_syscall(sys_perf_event_open); | 183 | cond_syscall(sys_perf_event_open); |
184 | |||
185 | /* fanotify! */ | ||
186 | cond_syscall(sys_fanotify_init); | ||
187 | cond_syscall(sys_fanotify_mark); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761f4876..f88552c6d227 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -44,16 +44,17 @@ | |||
44 | #include <linux/times.h> | 44 | #include <linux/times.h> |
45 | #include <linux/limits.h> | 45 | #include <linux/limits.h> |
46 | #include <linux/dcache.h> | 46 | #include <linux/dcache.h> |
47 | #include <linux/dnotify.h> | ||
47 | #include <linux/syscalls.h> | 48 | #include <linux/syscalls.h> |
48 | #include <linux/vmstat.h> | 49 | #include <linux/vmstat.h> |
49 | #include <linux/nfs_fs.h> | 50 | #include <linux/nfs_fs.h> |
50 | #include <linux/acpi.h> | 51 | #include <linux/acpi.h> |
51 | #include <linux/reboot.h> | 52 | #include <linux/reboot.h> |
52 | #include <linux/ftrace.h> | 53 | #include <linux/ftrace.h> |
53 | #include <linux/slow-work.h> | ||
54 | #include <linux/perf_event.h> | 54 | #include <linux/perf_event.h> |
55 | #include <linux/kprobes.h> | 55 | #include <linux/kprobes.h> |
56 | #include <linux/pipe_fs_i.h> | 56 | #include <linux/pipe_fs_i.h> |
57 | #include <linux/oom.h> | ||
57 | 58 | ||
58 | #include <asm/uaccess.h> | 59 | #include <asm/uaccess.h> |
59 | #include <asm/processor.h> | 60 | #include <asm/processor.h> |
@@ -76,15 +77,16 @@ | |||
76 | #include <scsi/sg.h> | 77 | #include <scsi/sg.h> |
77 | #endif | 78 | #endif |
78 | 79 | ||
80 | #ifdef CONFIG_LOCKUP_DETECTOR | ||
81 | #include <linux/nmi.h> | ||
82 | #endif | ||
83 | |||
79 | 84 | ||
80 | #if defined(CONFIG_SYSCTL) | 85 | #if defined(CONFIG_SYSCTL) |
81 | 86 | ||
82 | /* External variables not in a header file. */ | 87 | /* External variables not in a header file. */ |
83 | extern int sysctl_overcommit_memory; | 88 | extern int sysctl_overcommit_memory; |
84 | extern int sysctl_overcommit_ratio; | 89 | extern int sysctl_overcommit_ratio; |
85 | extern int sysctl_panic_on_oom; | ||
86 | extern int sysctl_oom_kill_allocating_task; | ||
87 | extern int sysctl_oom_dump_tasks; | ||
88 | extern int max_threads; | 90 | extern int max_threads; |
89 | extern int core_uses_pid; | 91 | extern int core_uses_pid; |
90 | extern int suid_dumpable; | 92 | extern int suid_dumpable; |
@@ -106,7 +108,7 @@ extern int blk_iopoll_enabled; | |||
106 | #endif | 108 | #endif |
107 | 109 | ||
108 | /* Constants used for minimum and maximum */ | 110 | /* Constants used for minimum and maximum */ |
109 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 111 | #ifdef CONFIG_LOCKUP_DETECTOR |
110 | static int sixty = 60; | 112 | static int sixty = 60; |
111 | static int neg_one = -1; | 113 | static int neg_one = -1; |
112 | #endif | 114 | #endif |
@@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8; | |||
130 | 132 | ||
131 | static int ngroups_max = NGROUPS_MAX; | 133 | static int ngroups_max = NGROUPS_MAX; |
132 | 134 | ||
135 | #ifdef CONFIG_INOTIFY_USER | ||
136 | #include <linux/inotify.h> | ||
137 | #endif | ||
133 | #ifdef CONFIG_SPARC | 138 | #ifdef CONFIG_SPARC |
134 | #include <asm/system.h> | 139 | #include <asm/system.h> |
135 | #endif | 140 | #endif |
@@ -206,9 +211,6 @@ static struct ctl_table fs_table[]; | |||
206 | static struct ctl_table debug_table[]; | 211 | static struct ctl_table debug_table[]; |
207 | static struct ctl_table dev_table[]; | 212 | static struct ctl_table dev_table[]; |
208 | extern struct ctl_table random_table[]; | 213 | extern struct ctl_table random_table[]; |
209 | #ifdef CONFIG_INOTIFY_USER | ||
210 | extern struct ctl_table inotify_table[]; | ||
211 | #endif | ||
212 | #ifdef CONFIG_EPOLL | 214 | #ifdef CONFIG_EPOLL |
213 | extern struct ctl_table epoll_table[]; | 215 | extern struct ctl_table epoll_table[]; |
214 | #endif | 216 | #endif |
@@ -562,7 +564,7 @@ static struct ctl_table kern_table[] = { | |||
562 | .extra2 = &one, | 564 | .extra2 = &one, |
563 | }, | 565 | }, |
564 | #endif | 566 | #endif |
565 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 567 | #ifdef CONFIG_HOTPLUG |
566 | { | 568 | { |
567 | .procname = "hotplug", | 569 | .procname = "hotplug", |
568 | .data = &uevent_helper, | 570 | .data = &uevent_helper, |
@@ -710,7 +712,34 @@ static struct ctl_table kern_table[] = { | |||
710 | .mode = 0444, | 712 | .mode = 0444, |
711 | .proc_handler = proc_dointvec, | 713 | .proc_handler = proc_dointvec, |
712 | }, | 714 | }, |
713 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 715 | #if defined(CONFIG_LOCKUP_DETECTOR) |
716 | { | ||
717 | .procname = "watchdog", | ||
718 | .data = &watchdog_enabled, | ||
719 | .maxlen = sizeof (int), | ||
720 | .mode = 0644, | ||
721 | .proc_handler = proc_dowatchdog_enabled, | ||
722 | }, | ||
723 | { | ||
724 | .procname = "watchdog_thresh", | ||
725 | .data = &softlockup_thresh, | ||
726 | .maxlen = sizeof(int), | ||
727 | .mode = 0644, | ||
728 | .proc_handler = proc_dowatchdog_thresh, | ||
729 | .extra1 = &neg_one, | ||
730 | .extra2 = &sixty, | ||
731 | }, | ||
732 | { | ||
733 | .procname = "softlockup_panic", | ||
734 | .data = &softlockup_panic, | ||
735 | .maxlen = sizeof(int), | ||
736 | .mode = 0644, | ||
737 | .proc_handler = proc_dointvec_minmax, | ||
738 | .extra1 = &zero, | ||
739 | .extra2 = &one, | ||
740 | }, | ||
741 | #endif | ||
742 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
714 | { | 743 | { |
715 | .procname = "unknown_nmi_panic", | 744 | .procname = "unknown_nmi_panic", |
716 | .data = &unknown_nmi_panic, | 745 | .data = &unknown_nmi_panic, |
@@ -813,26 +842,6 @@ static struct ctl_table kern_table[] = { | |||
813 | .proc_handler = proc_dointvec, | 842 | .proc_handler = proc_dointvec, |
814 | }, | 843 | }, |
815 | #endif | 844 | #endif |
816 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
817 | { | ||
818 | .procname = "softlockup_panic", | ||
819 | .data = &softlockup_panic, | ||
820 | .maxlen = sizeof(int), | ||
821 | .mode = 0644, | ||
822 | .proc_handler = proc_dointvec_minmax, | ||
823 | .extra1 = &zero, | ||
824 | .extra2 = &one, | ||
825 | }, | ||
826 | { | ||
827 | .procname = "softlockup_thresh", | ||
828 | .data = &softlockup_thresh, | ||
829 | .maxlen = sizeof(int), | ||
830 | .mode = 0644, | ||
831 | .proc_handler = proc_dosoftlockup_thresh, | ||
832 | .extra1 = &neg_one, | ||
833 | .extra2 = &sixty, | ||
834 | }, | ||
835 | #endif | ||
836 | #ifdef CONFIG_DETECT_HUNG_TASK | 845 | #ifdef CONFIG_DETECT_HUNG_TASK |
837 | { | 846 | { |
838 | .procname = "hung_task_panic", | 847 | .procname = "hung_task_panic", |
@@ -906,13 +915,6 @@ static struct ctl_table kern_table[] = { | |||
906 | .proc_handler = proc_dointvec, | 915 | .proc_handler = proc_dointvec, |
907 | }, | 916 | }, |
908 | #endif | 917 | #endif |
909 | #ifdef CONFIG_SLOW_WORK | ||
910 | { | ||
911 | .procname = "slow-work", | ||
912 | .mode = 0555, | ||
913 | .child = slow_work_sysctls, | ||
914 | }, | ||
915 | #endif | ||
916 | #ifdef CONFIG_PERF_EVENTS | 918 | #ifdef CONFIG_PERF_EVENTS |
917 | { | 919 | { |
918 | .procname = "perf_event_paranoid", | 920 | .procname = "perf_event_paranoid", |
@@ -1711,10 +1713,7 @@ static __init int sysctl_init(void) | |||
1711 | { | 1713 | { |
1712 | sysctl_set_parent(NULL, root_table); | 1714 | sysctl_set_parent(NULL, root_table); |
1713 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK | 1715 | #ifdef CONFIG_SYSCTL_SYSCALL_CHECK |
1714 | { | 1716 | sysctl_check_table(current->nsproxy, root_table); |
1715 | int err; | ||
1716 | err = sysctl_check_table(current->nsproxy, root_table); | ||
1717 | } | ||
1718 | #endif | 1717 | #endif |
1719 | return 0; | 1718 | return 0; |
1720 | } | 1719 | } |
diff --git a/kernel/time.c b/kernel/time.c index 848b1c2ab09a..ba9b338d1835 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
300 | } | 300 | } |
301 | EXPORT_SYMBOL(timespec_trunc); | 301 | EXPORT_SYMBOL(timespec_trunc); |
302 | 302 | ||
303 | #ifndef CONFIG_GENERIC_TIME | ||
304 | /* | ||
305 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | ||
306 | * and therefore only yields usec accuracy | ||
307 | */ | ||
308 | void getnstimeofday(struct timespec *tv) | ||
309 | { | ||
310 | struct timeval x; | ||
311 | |||
312 | do_gettimeofday(&x); | ||
313 | tv->tv_sec = x.tv_sec; | ||
314 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | ||
315 | } | ||
316 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
317 | #endif | ||
318 | |||
319 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 303 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
320 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 304 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
321 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | 305 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0a..f06a8a365648 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -6,7 +6,7 @@ config TICK_ONESHOT | |||
6 | 6 | ||
7 | config NO_HZ | 7 | config NO_HZ |
8 | bool "Tickless System (Dynamic Ticks)" | 8 | bool "Tickless System (Dynamic Ticks)" |
9 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | 9 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
10 | select TICK_ONESHOT | 10 | select TICK_ONESHOT |
11 | help | 11 | help |
12 | This option enables a tickless system: timer interrupts will | 12 | This option enables a tickless system: timer interrupts will |
@@ -15,7 +15,7 @@ config NO_HZ | |||
15 | 15 | ||
16 | config HIGH_RES_TIMERS | 16 | config HIGH_RES_TIMERS |
17 | bool "High Resolution Timer Support" | 17 | bool "High Resolution Timer Support" |
18 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | 18 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
19 | select TICK_ONESHOT | 19 | select TICK_ONESHOT |
20 | help | 20 | help |
21 | This option enables high resolution timer support. If your | 21 | This option enables high resolution timer support. If your |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c1d561..c18d7efa1b4b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
531 | return max_nsecs - (max_nsecs >> 5); | 531 | return max_nsecs - (max_nsecs >> 5); |
532 | } | 532 | } |
533 | 533 | ||
534 | #ifdef CONFIG_GENERIC_TIME | 534 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
535 | 535 | ||
536 | /** | 536 | /** |
537 | * clocksource_select - Select the best clocksource available | 537 | * clocksource_select - Select the best clocksource available |
@@ -577,7 +577,7 @@ static void clocksource_select(void) | |||
577 | } | 577 | } |
578 | } | 578 | } |
579 | 579 | ||
580 | #else /* CONFIG_GENERIC_TIME */ | 580 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ |
581 | 581 | ||
582 | static inline void clocksource_select(void) { } | 582 | static inline void clocksource_select(void) { } |
583 | 583 | ||
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | 639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ |
640 | 640 | ||
641 | /** | 641 | /** |
642 | * __clocksource_register_scale - Used to install new clocksources | 642 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
643 | * @cs: clocksource to be registered | 643 | * @cs: clocksource to be registered |
644 | * @scale: Scale factor multiplied against freq to get clocksource hz | 644 | * @scale: Scale factor multiplied against freq to get clocksource hz |
645 | * @freq: clocksource frequency (cycles per second) divided by scale | 645 | * @freq: clocksource frequency (cycles per second) divided by scale |
646 | * | 646 | * |
647 | * Returns -EBUSY if registration fails, zero otherwise. | 647 | * This should only be called from the clocksource->enable() method. |
648 | * | 648 | * |
649 | * This *SHOULD NOT* be called directly! Please use the | 649 | * This *SHOULD NOT* be called directly! Please use the |
650 | * clocksource_register_hz() or clocksource_register_khz helper functions. | 650 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. |
651 | */ | 651 | */ |
652 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 652 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
653 | { | 653 | { |
654 | |||
655 | /* | 654 | /* |
656 | * Ideally we want to use some of the limits used in | 655 | * Ideally we want to use some of the limits used in |
657 | * clocksource_max_deferment, to provide a more informed | 656 | * clocksource_max_deferment, to provide a more informed |
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
662 | NSEC_PER_SEC/scale, | 661 | NSEC_PER_SEC/scale, |
663 | MAX_UPDATE_LENGTH*scale); | 662 | MAX_UPDATE_LENGTH*scale); |
664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 663 | cs->max_idle_ns = clocksource_max_deferment(cs); |
664 | } | ||
665 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | ||
666 | |||
667 | /** | ||
668 | * __clocksource_register_scale - Used to install new clocksources | ||
669 | * @cs: clocksource to be registered | ||
670 | * @scale: Scale factor multiplied against freq to get clocksource hz | ||
671 | * @freq: clocksource frequency (cycles per second) divided by scale | ||
672 | * | ||
673 | * Returns -EBUSY if registration fails, zero otherwise. | ||
674 | * | ||
675 | * This *SHOULD NOT* be called directly! Please use the | ||
676 | * clocksource_register_hz() or clocksource_register_khz helper functions. | ||
677 | */ | ||
678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | ||
679 | { | ||
680 | |||
681 | /* Initialize mult/shift and max_idle_ns */ | ||
682 | __clocksource_updatefreq_scale(cs, scale, freq); | ||
665 | 683 | ||
684 | /* Add clocksource to the clocksource list */ | ||
666 | mutex_lock(&clocksource_mutex); | 685 | mutex_lock(&clocksource_mutex); |
667 | clocksource_enqueue(cs); | 686 | clocksource_enqueue(cs); |
668 | clocksource_select(); | 687 | clocksource_select(); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb61..3e216e01bbd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
325 | } while (read_seqretry(&xtime_lock, seq)); | 325 | } while (read_seqretry(&xtime_lock, seq)); |
326 | 326 | ||
327 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 327 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || |
328 | arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { | 328 | arch_needs_cpu(cpu)) { |
329 | next_jiffies = last_jiffies + 1; | 329 | next_jiffies = last_jiffies + 1; |
330 | delta_jiffies = 1; | 330 | delta_jiffies = 1; |
331 | } else { | 331 | } else { |
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
405 | * the scheduler tick in nohz_restart_sched_tick. | 405 | * the scheduler tick in nohz_restart_sched_tick. |
406 | */ | 406 | */ |
407 | if (!ts->tick_stopped) { | 407 | if (!ts->tick_stopped) { |
408 | if (select_nohz_load_balancer(1)) { | 408 | select_nohz_load_balancer(1); |
409 | /* | ||
410 | * sched tick not stopped! | ||
411 | */ | ||
412 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
413 | goto out; | ||
414 | } | ||
415 | 409 | ||
416 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 410 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
417 | ts->tick_stopped = 1; | 411 | ts->tick_stopped = 1; |
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) | |||
780 | { | 774 | { |
781 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 775 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
782 | ktime_t now = ktime_get(); | 776 | ktime_t now = ktime_get(); |
783 | u64 offset; | ||
784 | 777 | ||
785 | /* | 778 | /* |
786 | * Emulate tick processing via per-CPU hrtimers: | 779 | * Emulate tick processing via per-CPU hrtimers: |
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) | |||
790 | 783 | ||
791 | /* Get the next period (per cpu) */ | 784 | /* Get the next period (per cpu) */ |
792 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 785 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
793 | offset = ktime_to_ns(tick_period) >> 1; | ||
794 | do_div(offset, num_possible_cpus()); | ||
795 | offset *= smp_processor_id(); | ||
796 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
797 | 786 | ||
798 | for (;;) { | 787 | for (;;) { |
799 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 788 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c8..49010d822f72 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | |||
153 | * - wall_to_monotonic is no longer the boot time, getboottime must be | 153 | * - wall_to_monotonic is no longer the boot time, getboottime must be |
154 | * used instead. | 154 | * used instead. |
155 | */ | 155 | */ |
156 | struct timespec xtime __attribute__ ((aligned (16))); | 156 | static struct timespec xtime __attribute__ ((aligned (16))); |
157 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 157 | static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
158 | static struct timespec total_sleep_time; | 158 | static struct timespec total_sleep_time; |
159 | 159 | ||
160 | /* | 160 | /* |
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) | |||
170 | { | 170 | { |
171 | xtime.tv_sec += leapsecond; | 171 | xtime.tv_sec += leapsecond; |
172 | wall_to_monotonic.tv_sec -= leapsecond; | 172 | wall_to_monotonic.tv_sec -= leapsecond; |
173 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 173 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
174 | timekeeper.mult); | ||
174 | } | 175 | } |
175 | 176 | ||
176 | #ifdef CONFIG_GENERIC_TIME | ||
177 | |||
178 | /** | 177 | /** |
179 | * timekeeping_forward_now - update clock to the current time | 178 | * timekeeping_forward_now - update clock to the current time |
180 | * | 179 | * |
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) | |||
328 | timekeeper.ntp_error = 0; | 327 | timekeeper.ntp_error = 0; |
329 | ntp_clear(); | 328 | ntp_clear(); |
330 | 329 | ||
331 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 330 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
331 | timekeeper.mult); | ||
332 | 332 | ||
333 | write_sequnlock_irqrestore(&xtime_lock, flags); | 333 | write_sequnlock_irqrestore(&xtime_lock, flags); |
334 | 334 | ||
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) | |||
376 | tick_clock_notify(); | 376 | tick_clock_notify(); |
377 | } | 377 | } |
378 | 378 | ||
379 | #else /* GENERIC_TIME */ | ||
380 | |||
381 | static inline void timekeeping_forward_now(void) { } | ||
382 | |||
383 | /** | ||
384 | * ktime_get - get the monotonic time in ktime_t format | ||
385 | * | ||
386 | * returns the time in ktime_t format | ||
387 | */ | ||
388 | ktime_t ktime_get(void) | ||
389 | { | ||
390 | struct timespec now; | ||
391 | |||
392 | ktime_get_ts(&now); | ||
393 | |||
394 | return timespec_to_ktime(now); | ||
395 | } | ||
396 | EXPORT_SYMBOL_GPL(ktime_get); | ||
397 | |||
398 | /** | ||
399 | * ktime_get_ts - get the monotonic clock in timespec format | ||
400 | * @ts: pointer to timespec variable | ||
401 | * | ||
402 | * The function calculates the monotonic clock from the realtime | ||
403 | * clock and the wall_to_monotonic offset and stores the result | ||
404 | * in normalized timespec format in the variable pointed to by @ts. | ||
405 | */ | ||
406 | void ktime_get_ts(struct timespec *ts) | ||
407 | { | ||
408 | struct timespec tomono; | ||
409 | unsigned long seq; | ||
410 | |||
411 | do { | ||
412 | seq = read_seqbegin(&xtime_lock); | ||
413 | getnstimeofday(ts); | ||
414 | tomono = wall_to_monotonic; | ||
415 | |||
416 | } while (read_seqretry(&xtime_lock, seq)); | ||
417 | |||
418 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | ||
419 | ts->tv_nsec + tomono.tv_nsec); | ||
420 | } | ||
421 | EXPORT_SYMBOL_GPL(ktime_get_ts); | ||
422 | |||
423 | #endif /* !GENERIC_TIME */ | ||
424 | |||
425 | /** | 379 | /** |
426 | * ktime_get_real - get the real (wall-) time in ktime_t format | 380 | * ktime_get_real - get the real (wall-) time in ktime_t format |
427 | * | 381 | * |
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) | |||
579 | 533 | ||
580 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 534 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
581 | ts = timespec_sub(ts, timekeeping_suspend_time); | 535 | ts = timespec_sub(ts, timekeeping_suspend_time); |
582 | xtime = timespec_add_safe(xtime, ts); | 536 | xtime = timespec_add(xtime, ts); |
583 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | 537 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); |
584 | total_sleep_time = timespec_add_safe(total_sleep_time, ts); | 538 | total_sleep_time = timespec_add(total_sleep_time, ts); |
585 | } | 539 | } |
586 | /* re-base the last cycle value */ | 540 | /* re-base the last cycle value */ |
587 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 541 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset) | |||
736 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | 690 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) |
737 | { | 691 | { |
738 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | 692 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; |
693 | u64 raw_nsecs; | ||
739 | 694 | ||
740 | /* If the offset is smaller than a shifted interval, do nothing */ | 695 | /* If the offset is smaller than a shifted interval, do nothing */ |
741 | if (offset < timekeeper.cycle_interval<<shift) | 696 | if (offset < timekeeper.cycle_interval<<shift) |
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
752 | second_overflow(); | 707 | second_overflow(); |
753 | } | 708 | } |
754 | 709 | ||
755 | /* Accumulate into raw time */ | 710 | /* Accumulate raw time */ |
756 | raw_time.tv_nsec += timekeeper.raw_interval << shift;; | 711 | raw_nsecs = timekeeper.raw_interval << shift; |
757 | while (raw_time.tv_nsec >= NSEC_PER_SEC) { | 712 | raw_nsecs += raw_time.tv_nsec; |
758 | raw_time.tv_nsec -= NSEC_PER_SEC; | 713 | if (raw_nsecs >= NSEC_PER_SEC) { |
759 | raw_time.tv_sec++; | 714 | u64 raw_secs = raw_nsecs; |
715 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); | ||
716 | raw_time.tv_sec += raw_secs; | ||
760 | } | 717 | } |
718 | raw_time.tv_nsec = raw_nsecs; | ||
761 | 719 | ||
762 | /* Accumulate error between NTP and clock interval */ | 720 | /* Accumulate error between NTP and clock interval */ |
763 | timekeeper.ntp_error += tick_length << shift; | 721 | timekeeper.ntp_error += tick_length << shift; |
@@ -784,10 +742,11 @@ void update_wall_time(void) | |||
784 | return; | 742 | return; |
785 | 743 | ||
786 | clock = timekeeper.clock; | 744 | clock = timekeeper.clock; |
787 | #ifdef CONFIG_GENERIC_TIME | 745 | |
788 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 746 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
789 | #else | ||
790 | offset = timekeeper.cycle_interval; | 747 | offset = timekeeper.cycle_interval; |
748 | #else | ||
749 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | ||
791 | #endif | 750 | #endif |
792 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; | 751 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; |
793 | 752 | ||
@@ -856,7 +815,8 @@ void update_wall_time(void) | |||
856 | } | 815 | } |
857 | 816 | ||
858 | /* check to see if there is a new clocksource to use */ | 817 | /* check to see if there is a new clocksource to use */ |
859 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 818 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
819 | timekeeper.mult); | ||
860 | } | 820 | } |
861 | 821 | ||
862 | /** | 822 | /** |
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime); | |||
887 | */ | 847 | */ |
888 | void monotonic_to_bootbased(struct timespec *ts) | 848 | void monotonic_to_bootbased(struct timespec *ts) |
889 | { | 849 | { |
890 | *ts = timespec_add_safe(*ts, total_sleep_time); | 850 | *ts = timespec_add(*ts, total_sleep_time); |
891 | } | 851 | } |
892 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 852 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); |
893 | 853 | ||
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void) | |||
902 | return xtime; | 862 | return xtime; |
903 | } | 863 | } |
904 | 864 | ||
865 | struct timespec __get_wall_to_monotonic(void) | ||
866 | { | ||
867 | return wall_to_monotonic; | ||
868 | } | ||
869 | |||
905 | struct timespec current_kernel_time(void) | 870 | struct timespec current_kernel_time(void) |
906 | { | 871 | { |
907 | struct timespec now; | 872 | struct timespec now; |
diff --git a/kernel/timer.c b/kernel/timer.c index efde11e197c4..97bf05baade7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | |||
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
93 | * base in timer_list is guaranteed to be zero. Use the LSB for | 93 | * base in timer_list is guaranteed to be zero. Use the LSB to |
94 | * the new flag to indicate whether the timer is deferrable | 94 | * indicate whether the timer is deferrable. |
95 | * | ||
96 | * A deferrable timer will work normally when the system is busy, but | ||
97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
99 | * subsequent non-deferrable timer. | ||
95 | */ | 100 | */ |
96 | #define TBASE_DEFERRABLE_FLAG (0x1) | 101 | #define TBASE_DEFERRABLE_FLAG (0x1) |
97 | 102 | ||
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); | |||
321 | 326 | ||
322 | /** | 327 | /** |
323 | * set_timer_slack - set the allowed slack for a timer | 328 | * set_timer_slack - set the allowed slack for a timer |
329 | * @timer: the timer to be modified | ||
324 | * @slack_hz: the amount of time (in jiffies) allowed for rounding | 330 | * @slack_hz: the amount of time (in jiffies) allowed for rounding |
325 | * | 331 | * |
326 | * Set the amount of time, in jiffies, that a certain timer has | 332 | * Set the amount of time, in jiffies, that a certain timer has |
@@ -692,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
692 | cpu = smp_processor_id(); | 698 | cpu = smp_processor_id(); |
693 | 699 | ||
694 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 700 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) |
695 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { | 701 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) |
696 | int preferred_cpu = get_nohz_load_balancer(); | 702 | cpu = get_nohz_timer_target(); |
697 | |||
698 | if (preferred_cpu >= 0) | ||
699 | cpu = preferred_cpu; | ||
700 | } | ||
701 | #endif | 703 | #endif |
702 | new_base = per_cpu(tvec_bases, cpu); | 704 | new_base = per_cpu(tvec_bases, cpu); |
703 | 705 | ||
@@ -1302,7 +1304,6 @@ void run_local_timers(void) | |||
1302 | { | 1304 | { |
1303 | hrtimer_run_queues(); | 1305 | hrtimer_run_queues(); |
1304 | raise_softirq(TIMER_SOFTIRQ); | 1306 | raise_softirq(TIMER_SOFTIRQ); |
1305 | softlockup_tick(); | ||
1306 | } | 1307 | } |
1307 | 1308 | ||
1308 | /* | 1309 | /* |
@@ -1763,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs) | |||
1763 | } | 1764 | } |
1764 | 1765 | ||
1765 | EXPORT_SYMBOL(msleep_interruptible); | 1766 | EXPORT_SYMBOL(msleep_interruptible); |
1767 | |||
1768 | static int __sched do_usleep_range(unsigned long min, unsigned long max) | ||
1769 | { | ||
1770 | ktime_t kmin; | ||
1771 | unsigned long delta; | ||
1772 | |||
1773 | kmin = ktime_set(0, min * NSEC_PER_USEC); | ||
1774 | delta = (max - min) * NSEC_PER_USEC; | ||
1775 | return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); | ||
1776 | } | ||
1777 | |||
1778 | /** | ||
1779 | * usleep_range - Drop in replacement for udelay where wakeup is flexible | ||
1780 | * @min: Minimum time in usecs to sleep | ||
1781 | * @max: Maximum time in usecs to sleep | ||
1782 | */ | ||
1783 | void usleep_range(unsigned long min, unsigned long max) | ||
1784 | { | ||
1785 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
1786 | do_usleep_range(min, max); | ||
1787 | } | ||
1788 | EXPORT_SYMBOL(usleep_range); | ||
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545b..538501c6ea50 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER | |||
153 | bool "Interrupts-off Latency Tracer" | 153 | bool "Interrupts-off Latency Tracer" |
154 | default n | 154 | default n |
155 | depends on TRACE_IRQFLAGS_SUPPORT | 155 | depends on TRACE_IRQFLAGS_SUPPORT |
156 | depends on GENERIC_TIME | 156 | depends on !ARCH_USES_GETTIMEOFFSET |
157 | select TRACE_IRQFLAGS | 157 | select TRACE_IRQFLAGS |
158 | select GENERIC_TRACER | 158 | select GENERIC_TRACER |
159 | select TRACER_MAX_TRACE | 159 | select TRACER_MAX_TRACE |
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER | |||
175 | config PREEMPT_TRACER | 175 | config PREEMPT_TRACER |
176 | bool "Preemption-off Latency Tracer" | 176 | bool "Preemption-off Latency Tracer" |
177 | default n | 177 | default n |
178 | depends on GENERIC_TIME | 178 | depends on !ARCH_USES_GETTIMEOFFSET |
179 | depends on PREEMPT | 179 | depends on PREEMPT |
180 | select GENERIC_TRACER | 180 | select GENERIC_TRACER |
181 | select TRACER_MAX_TRACE | 181 | select TRACER_MAX_TRACE |
@@ -194,15 +194,6 @@ config PREEMPT_TRACER | |||
194 | enabled. This option and the irqs-off timing option can be | 194 | enabled. This option and the irqs-off timing option can be |
195 | used together or separately.) | 195 | used together or separately.) |
196 | 196 | ||
197 | config SYSPROF_TRACER | ||
198 | bool "Sysprof Tracer" | ||
199 | depends on X86 | ||
200 | select GENERIC_TRACER | ||
201 | select CONTEXT_SWITCH_TRACER | ||
202 | help | ||
203 | This tracer provides the trace needed by the 'Sysprof' userspace | ||
204 | tool. | ||
205 | |||
206 | config SCHED_TRACER | 197 | config SCHED_TRACER |
207 | bool "Scheduling Latency Tracer" | 198 | bool "Scheduling Latency Tracer" |
208 | select GENERIC_TRACER | 199 | select GENERIC_TRACER |
@@ -229,23 +220,6 @@ config FTRACE_SYSCALLS | |||
229 | help | 220 | help |
230 | Basic tracer to catch the syscall entry and exit events. | 221 | Basic tracer to catch the syscall entry and exit events. |
231 | 222 | ||
232 | config BOOT_TRACER | ||
233 | bool "Trace boot initcalls" | ||
234 | select GENERIC_TRACER | ||
235 | select CONTEXT_SWITCH_TRACER | ||
236 | help | ||
237 | This tracer helps developers to optimize boot times: it records | ||
238 | the timings of the initcalls and traces key events and the identity | ||
239 | of tasks that can cause boot delays, such as context-switches. | ||
240 | |||
241 | Its aim is to be parsed by the scripts/bootgraph.pl tool to | ||
242 | produce pretty graphics about boot inefficiencies, giving a visual | ||
243 | representation of the delays during initcalls - but the raw | ||
244 | /debug/tracing/trace text output is readable too. | ||
245 | |||
246 | You must pass in initcall_debug and ftrace=initcall to the kernel | ||
247 | command line to enable this on bootup. | ||
248 | |||
249 | config TRACE_BRANCH_PROFILING | 223 | config TRACE_BRANCH_PROFILING |
250 | bool | 224 | bool |
251 | select GENERIC_TRACER | 225 | select GENERIC_TRACER |
@@ -325,28 +299,6 @@ config BRANCH_TRACER | |||
325 | 299 | ||
326 | Say N if unsure. | 300 | Say N if unsure. |
327 | 301 | ||
328 | config KSYM_TRACER | ||
329 | bool "Trace read and write access on kernel memory locations" | ||
330 | depends on HAVE_HW_BREAKPOINT | ||
331 | select TRACING | ||
332 | help | ||
333 | This tracer helps find read and write operations on any given kernel | ||
334 | symbol i.e. /proc/kallsyms. | ||
335 | |||
336 | config PROFILE_KSYM_TRACER | ||
337 | bool "Profile all kernel memory accesses on 'watched' variables" | ||
338 | depends on KSYM_TRACER | ||
339 | help | ||
340 | This tracer profiles kernel accesses on variables watched through the | ||
341 | ksym tracer ftrace plugin. Depending upon the hardware, all read | ||
342 | and write operations on kernel variables can be monitored for | ||
343 | accesses. | ||
344 | |||
345 | The results will be displayed in: | ||
346 | /debugfs/tracing/profile_ksym | ||
347 | |||
348 | Say N if unsure. | ||
349 | |||
350 | config STACK_TRACER | 302 | config STACK_TRACER |
351 | bool "Trace max stack" | 303 | bool "Trace max stack" |
352 | depends on HAVE_FUNCTION_TRACER | 304 | depends on HAVE_FUNCTION_TRACER |
@@ -371,37 +323,6 @@ config STACK_TRACER | |||
371 | 323 | ||
372 | Say N if unsure. | 324 | Say N if unsure. |
373 | 325 | ||
374 | config KMEMTRACE | ||
375 | bool "Trace SLAB allocations" | ||
376 | select GENERIC_TRACER | ||
377 | help | ||
378 | kmemtrace provides tracing for slab allocator functions, such as | ||
379 | kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected | ||
380 | data is then fed to the userspace application in order to analyse | ||
381 | allocation hotspots, internal fragmentation and so on, making it | ||
382 | possible to see how well an allocator performs, as well as debug | ||
383 | and profile kernel code. | ||
384 | |||
385 | This requires an userspace application to use. See | ||
386 | Documentation/trace/kmemtrace.txt for more information. | ||
387 | |||
388 | Saying Y will make the kernel somewhat larger and slower. However, | ||
389 | if you disable kmemtrace at run-time or boot-time, the performance | ||
390 | impact is minimal (depending on the arch the kernel is built for). | ||
391 | |||
392 | If unsure, say N. | ||
393 | |||
394 | config WORKQUEUE_TRACER | ||
395 | bool "Trace workqueues" | ||
396 | select GENERIC_TRACER | ||
397 | help | ||
398 | The workqueue tracer provides some statistical information | ||
399 | about each cpu workqueue thread such as the number of the | ||
400 | works inserted and executed since their creation. It can help | ||
401 | to evaluate the amount of work each of them has to perform. | ||
402 | For example it can help a developer to decide whether he should | ||
403 | choose a per-cpu workqueue instead of a singlethreaded one. | ||
404 | |||
405 | config BLK_DEV_IO_TRACE | 326 | config BLK_DEV_IO_TRACE |
406 | bool "Support for tracing block IO actions" | 327 | bool "Support for tracing block IO actions" |
407 | depends on SYSFS | 328 | depends on SYSFS |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b0550e..53f338190b26 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o | |||
30 | obj-$(CONFIG_TRACING) += trace_stat.o | 30 | obj-$(CONFIG_TRACING) += trace_stat.o |
31 | obj-$(CONFIG_TRACING) += trace_printk.o | 31 | obj-$(CONFIG_TRACING) += trace_printk.o |
32 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o | 32 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o |
33 | obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o | ||
34 | obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o | 33 | obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o |
35 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 34 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
36 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 35 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
@@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | |||
38 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 37 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
39 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 38 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
40 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 39 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
41 | obj-$(CONFIG_BOOT_TRACER) += trace_boot.o | ||
42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 40 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 41 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
44 | obj-$(CONFIG_KMEMTRACE) += kmemtrace.o | ||
45 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | 42 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o |
46 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 43 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
47 | ifeq ($(CONFIG_BLOCK),y) | 44 | ifeq ($(CONFIG_BLOCK),y) |
@@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
55 | endif | 52 | endif |
56 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
57 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
58 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o | ||
59 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o |
56 | ifeq ($(CONFIG_TRACING),y) | ||
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | ||
58 | endif | ||
60 | 59 | ||
61 | libftrace-y := ftrace.o | 60 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 638711c17504..959f8d6c8cc1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, | |||
169 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), | 169 | static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), |
170 | BLK_TC_ACT(BLK_TC_WRITE) }; | 170 | BLK_TC_ACT(BLK_TC_WRITE) }; |
171 | 171 | ||
172 | #define BLK_TC_HARDBARRIER BLK_TC_BARRIER | ||
173 | #define BLK_TC_RAHEAD BLK_TC_AHEAD | ||
174 | |||
172 | /* The ilog2() calls fall out because they're constant */ | 175 | /* The ilog2() calls fall out because they're constant */ |
173 | #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ | 176 | #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ |
174 | (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) | 177 | (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) |
175 | 178 | ||
176 | /* | 179 | /* |
177 | * The worker for the various blk_add_trace*() types. Fills out a | 180 | * The worker for the various blk_add_trace*() types. Fills out a |
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, | |||
194 | return; | 197 | return; |
195 | 198 | ||
196 | what |= ddir_act[rw & WRITE]; | 199 | what |= ddir_act[rw & WRITE]; |
197 | what |= MASK_TC_BIT(rw, BARRIER); | 200 | what |= MASK_TC_BIT(rw, HARDBARRIER); |
198 | what |= MASK_TC_BIT(rw, SYNCIO); | 201 | what |= MASK_TC_BIT(rw, SYNC); |
199 | what |= MASK_TC_BIT(rw, AHEAD); | 202 | what |= MASK_TC_BIT(rw, RAHEAD); |
200 | what |= MASK_TC_BIT(rw, META); | 203 | what |= MASK_TC_BIT(rw, META); |
201 | what |= MASK_TC_BIT(rw, DISCARD); | 204 | what |= MASK_TC_BIT(rw, DISCARD); |
202 | 205 | ||
@@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
549 | } | 552 | } |
550 | EXPORT_SYMBOL_GPL(blk_trace_setup); | 553 | EXPORT_SYMBOL_GPL(blk_trace_setup); |
551 | 554 | ||
555 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) | ||
556 | static int compat_blk_trace_setup(struct request_queue *q, char *name, | ||
557 | dev_t dev, struct block_device *bdev, | ||
558 | char __user *arg) | ||
559 | { | ||
560 | struct blk_user_trace_setup buts; | ||
561 | struct compat_blk_user_trace_setup cbuts; | ||
562 | int ret; | ||
563 | |||
564 | if (copy_from_user(&cbuts, arg, sizeof(cbuts))) | ||
565 | return -EFAULT; | ||
566 | |||
567 | buts = (struct blk_user_trace_setup) { | ||
568 | .act_mask = cbuts.act_mask, | ||
569 | .buf_size = cbuts.buf_size, | ||
570 | .buf_nr = cbuts.buf_nr, | ||
571 | .start_lba = cbuts.start_lba, | ||
572 | .end_lba = cbuts.end_lba, | ||
573 | .pid = cbuts.pid, | ||
574 | }; | ||
575 | memcpy(&buts.name, &cbuts.name, 32); | ||
576 | |||
577 | ret = do_blk_trace_setup(q, name, dev, bdev, &buts); | ||
578 | if (ret) | ||
579 | return ret; | ||
580 | |||
581 | if (copy_to_user(arg, &buts.name, 32)) { | ||
582 | blk_trace_remove(q); | ||
583 | return -EFAULT; | ||
584 | } | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | #endif | ||
589 | |||
552 | int blk_trace_startstop(struct request_queue *q, int start) | 590 | int blk_trace_startstop(struct request_queue *q, int start) |
553 | { | 591 | { |
554 | int ret; | 592 | int ret; |
@@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
601 | if (!q) | 639 | if (!q) |
602 | return -ENXIO; | 640 | return -ENXIO; |
603 | 641 | ||
642 | lock_kernel(); | ||
604 | mutex_lock(&bdev->bd_mutex); | 643 | mutex_lock(&bdev->bd_mutex); |
605 | 644 | ||
606 | switch (cmd) { | 645 | switch (cmd) { |
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
608 | bdevname(bdev, b); | 647 | bdevname(bdev, b); |
609 | ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); | 648 | ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); |
610 | break; | 649 | break; |
650 | #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) | ||
651 | case BLKTRACESETUP32: | ||
652 | bdevname(bdev, b); | ||
653 | ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); | ||
654 | break; | ||
655 | #endif | ||
611 | case BLKTRACESTART: | 656 | case BLKTRACESTART: |
612 | start = 1; | 657 | start = 1; |
613 | case BLKTRACESTOP: | 658 | case BLKTRACESTOP: |
@@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) | |||
622 | } | 667 | } |
623 | 668 | ||
624 | mutex_unlock(&bdev->bd_mutex); | 669 | mutex_unlock(&bdev->bd_mutex); |
670 | unlock_kernel(); | ||
625 | return ret; | 671 | return ret; |
626 | } | 672 | } |
627 | 673 | ||
@@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, | |||
661 | if (likely(!bt)) | 707 | if (likely(!bt)) |
662 | return; | 708 | return; |
663 | 709 | ||
664 | if (blk_discard_rq(rq)) | 710 | if (rq->cmd_flags & REQ_DISCARD) |
665 | rw |= (1 << BIO_RW_DISCARD); | 711 | rw |= REQ_DISCARD; |
712 | |||
713 | if (rq->cmd_flags & REQ_SECURE) | ||
714 | rw |= REQ_SECURE; | ||
666 | 715 | ||
667 | if (blk_pc_request(rq)) { | 716 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { |
668 | what |= BLK_TC_ACT(BLK_TC_PC); | 717 | what |= BLK_TC_ACT(BLK_TC_PC); |
669 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, | 718 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, |
670 | what, rq->errors, rq->cmd_len, rq->cmd); | 719 | what, rq->errors, rq->cmd_len, rq->cmd); |
@@ -925,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q, | |||
925 | if (likely(!bt)) | 974 | if (likely(!bt)) |
926 | return; | 975 | return; |
927 | 976 | ||
928 | if (blk_pc_request(rq)) | 977 | if (rq->cmd_type == REQ_TYPE_BLOCK_PC) |
929 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, | 978 | __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, |
930 | BLK_TA_DRV_DATA, rq->errors, len, data); | 979 | BLK_TA_DRV_DATA, rq->errors, len, data); |
931 | else | 980 | else |
@@ -1730,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq) | |||
1730 | int len = rq->cmd_len; | 1779 | int len = rq->cmd_len; |
1731 | unsigned char *cmd = rq->cmd; | 1780 | unsigned char *cmd = rq->cmd; |
1732 | 1781 | ||
1733 | if (!blk_pc_request(rq)) { | 1782 | if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { |
1734 | buf[0] = '\0'; | 1783 | buf[0] = '\0'; |
1735 | return; | 1784 | return; |
1736 | } | 1785 | } |
@@ -1755,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) | |||
1755 | 1804 | ||
1756 | if (rw & WRITE) | 1805 | if (rw & WRITE) |
1757 | rwbs[i++] = 'W'; | 1806 | rwbs[i++] = 'W'; |
1758 | else if (rw & 1 << BIO_RW_DISCARD) | 1807 | else if (rw & REQ_DISCARD) |
1759 | rwbs[i++] = 'D'; | 1808 | rwbs[i++] = 'D'; |
1760 | else if (bytes) | 1809 | else if (bytes) |
1761 | rwbs[i++] = 'R'; | 1810 | rwbs[i++] = 'R'; |
1762 | else | 1811 | else |
1763 | rwbs[i++] = 'N'; | 1812 | rwbs[i++] = 'N'; |
1764 | 1813 | ||
1765 | if (rw & 1 << BIO_RW_AHEAD) | 1814 | if (rw & REQ_RAHEAD) |
1766 | rwbs[i++] = 'A'; | 1815 | rwbs[i++] = 'A'; |
1767 | if (rw & 1 << BIO_RW_BARRIER) | 1816 | if (rw & REQ_HARDBARRIER) |
1768 | rwbs[i++] = 'B'; | 1817 | rwbs[i++] = 'B'; |
1769 | if (rw & 1 << BIO_RW_SYNCIO) | 1818 | if (rw & REQ_SYNC) |
1770 | rwbs[i++] = 'S'; | 1819 | rwbs[i++] = 'S'; |
1771 | if (rw & 1 << BIO_RW_META) | 1820 | if (rw & REQ_META) |
1772 | rwbs[i++] = 'M'; | 1821 | rwbs[i++] = 'M'; |
1822 | if (rw & REQ_SECURE) | ||
1823 | rwbs[i++] = 'E'; | ||
1773 | 1824 | ||
1774 | rwbs[i] = '\0'; | 1825 | rwbs[i] = '\0'; |
1775 | } | 1826 | } |
@@ -1779,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) | |||
1779 | int rw = rq->cmd_flags & 0x03; | 1830 | int rw = rq->cmd_flags & 0x03; |
1780 | int bytes; | 1831 | int bytes; |
1781 | 1832 | ||
1782 | if (blk_discard_rq(rq)) | 1833 | if (rq->cmd_flags & REQ_DISCARD) |
1783 | rw |= (1 << BIO_RW_DISCARD); | 1834 | rw |= REQ_DISCARD; |
1835 | |||
1836 | if (rq->cmd_flags & REQ_SECURE) | ||
1837 | rw |= REQ_SECURE; | ||
1784 | 1838 | ||
1785 | bytes = blk_rq_bytes(rq); | 1839 | bytes = blk_rq_bytes(rq); |
1786 | 1840 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f9449..fa7ece649fe1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
381 | { | 381 | { |
382 | struct ftrace_profile *rec = v; | 382 | struct ftrace_profile *rec = v; |
383 | char str[KSYM_SYMBOL_LEN]; | 383 | char str[KSYM_SYMBOL_LEN]; |
384 | int ret = 0; | ||
384 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 385 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
385 | static DEFINE_MUTEX(mutex); | ||
386 | static struct trace_seq s; | 386 | static struct trace_seq s; |
387 | unsigned long long avg; | 387 | unsigned long long avg; |
388 | unsigned long long stddev; | 388 | unsigned long long stddev; |
389 | #endif | 389 | #endif |
390 | mutex_lock(&ftrace_profile_lock); | ||
391 | |||
392 | /* we raced with function_profile_reset() */ | ||
393 | if (unlikely(rec->counter == 0)) { | ||
394 | ret = -EBUSY; | ||
395 | goto out; | ||
396 | } | ||
390 | 397 | ||
391 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | 398 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); |
392 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); | 399 | seq_printf(m, " %-30.30s %10lu", str, rec->counter); |
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
408 | do_div(stddev, (rec->counter - 1) * 1000); | 415 | do_div(stddev, (rec->counter - 1) * 1000); |
409 | } | 416 | } |
410 | 417 | ||
411 | mutex_lock(&mutex); | ||
412 | trace_seq_init(&s); | 418 | trace_seq_init(&s); |
413 | trace_print_graph_duration(rec->time, &s); | 419 | trace_print_graph_duration(rec->time, &s); |
414 | trace_seq_puts(&s, " "); | 420 | trace_seq_puts(&s, " "); |
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) | |||
416 | trace_seq_puts(&s, " "); | 422 | trace_seq_puts(&s, " "); |
417 | trace_print_graph_duration(stddev, &s); | 423 | trace_print_graph_duration(stddev, &s); |
418 | trace_print_seq(m, &s); | 424 | trace_print_seq(m, &s); |
419 | mutex_unlock(&mutex); | ||
420 | #endif | 425 | #endif |
421 | seq_putc(m, '\n'); | 426 | seq_putc(m, '\n'); |
427 | out: | ||
428 | mutex_unlock(&ftrace_profile_lock); | ||
422 | 429 | ||
423 | return 0; | 430 | return ret; |
424 | } | 431 | } |
425 | 432 | ||
426 | static void ftrace_profile_reset(struct ftrace_profile_stat *stat) | 433 | static void ftrace_profile_reset(struct ftrace_profile_stat *stat) |
@@ -1503,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1503 | if (*pos > 0) | 1510 | if (*pos > 0) |
1504 | return t_hash_start(m, pos); | 1511 | return t_hash_start(m, pos); |
1505 | iter->flags |= FTRACE_ITER_PRINTALL; | 1512 | iter->flags |= FTRACE_ITER_PRINTALL; |
1513 | /* reset in case of seek/pread */ | ||
1514 | iter->flags &= ~FTRACE_ITER_HASH; | ||
1506 | return iter; | 1515 | return iter; |
1507 | } | 1516 | } |
1508 | 1517 | ||
@@ -1883,7 +1892,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
1883 | struct hlist_head *hhd; | 1892 | struct hlist_head *hhd; |
1884 | struct hlist_node *n; | 1893 | struct hlist_node *n; |
1885 | unsigned long key; | 1894 | unsigned long key; |
1886 | int resched; | ||
1887 | 1895 | ||
1888 | key = hash_long(ip, FTRACE_HASH_BITS); | 1896 | key = hash_long(ip, FTRACE_HASH_BITS); |
1889 | 1897 | ||
@@ -1897,12 +1905,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
1897 | * period. This syncs the hash iteration and freeing of items | 1905 | * period. This syncs the hash iteration and freeing of items |
1898 | * on the hash. rcu_read_lock is too dangerous here. | 1906 | * on the hash. rcu_read_lock is too dangerous here. |
1899 | */ | 1907 | */ |
1900 | resched = ftrace_preempt_disable(); | 1908 | preempt_disable_notrace(); |
1901 | hlist_for_each_entry_rcu(entry, n, hhd, node) { | 1909 | hlist_for_each_entry_rcu(entry, n, hhd, node) { |
1902 | if (entry->ip == ip) | 1910 | if (entry->ip == ip) |
1903 | entry->ops->func(ip, parent_ip, &entry->data); | 1911 | entry->ops->func(ip, parent_ip, &entry->data); |
1904 | } | 1912 | } |
1905 | ftrace_preempt_enable(resched); | 1913 | preempt_enable_notrace(); |
1906 | } | 1914 | } |
1907 | 1915 | ||
1908 | static struct ftrace_ops trace_probe_ops __read_mostly = | 1916 | static struct ftrace_ops trace_probe_ops __read_mostly = |
@@ -2410,7 +2418,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
2410 | .open = ftrace_filter_open, | 2418 | .open = ftrace_filter_open, |
2411 | .read = seq_read, | 2419 | .read = seq_read, |
2412 | .write = ftrace_filter_write, | 2420 | .write = ftrace_filter_write, |
2413 | .llseek = ftrace_regex_lseek, | 2421 | .llseek = no_llseek, |
2414 | .release = ftrace_filter_release, | 2422 | .release = ftrace_filter_release, |
2415 | }; | 2423 | }; |
2416 | 2424 | ||
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660b..000000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null | |||
@@ -1,529 +0,0 @@ | |||
1 | /* | ||
2 | * Memory allocator tracing | ||
3 | * | ||
4 | * Copyright (C) 2008 Eduard - Gabriel Munteanu | ||
5 | * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> | ||
6 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
7 | */ | ||
8 | |||
9 | #include <linux/tracepoint.h> | ||
10 | #include <linux/seq_file.h> | ||
11 | #include <linux/debugfs.h> | ||
12 | #include <linux/dcache.h> | ||
13 | #include <linux/fs.h> | ||
14 | |||
15 | #include <linux/kmemtrace.h> | ||
16 | |||
17 | #include "trace_output.h" | ||
18 | #include "trace.h" | ||
19 | |||
20 | /* Select an alternative, minimalistic output than the original one */ | ||
21 | #define TRACE_KMEM_OPT_MINIMAL 0x1 | ||
22 | |||
23 | static struct tracer_opt kmem_opts[] = { | ||
24 | /* Default disable the minimalistic output */ | ||
25 | { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, | ||
26 | { } | ||
27 | }; | ||
28 | |||
29 | static struct tracer_flags kmem_tracer_flags = { | ||
30 | .val = 0, | ||
31 | .opts = kmem_opts | ||
32 | }; | ||
33 | |||
34 | static struct trace_array *kmemtrace_array; | ||
35 | |||
36 | /* Trace allocations */ | ||
37 | static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, | ||
38 | unsigned long call_site, | ||
39 | const void *ptr, | ||
40 | size_t bytes_req, | ||
41 | size_t bytes_alloc, | ||
42 | gfp_t gfp_flags, | ||
43 | int node) | ||
44 | { | ||
45 | struct ftrace_event_call *call = &event_kmem_alloc; | ||
46 | struct trace_array *tr = kmemtrace_array; | ||
47 | struct kmemtrace_alloc_entry *entry; | ||
48 | struct ring_buffer_event *event; | ||
49 | |||
50 | event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); | ||
51 | if (!event) | ||
52 | return; | ||
53 | |||
54 | entry = ring_buffer_event_data(event); | ||
55 | tracing_generic_entry_update(&entry->ent, 0, 0); | ||
56 | |||
57 | entry->ent.type = TRACE_KMEM_ALLOC; | ||
58 | entry->type_id = type_id; | ||
59 | entry->call_site = call_site; | ||
60 | entry->ptr = ptr; | ||
61 | entry->bytes_req = bytes_req; | ||
62 | entry->bytes_alloc = bytes_alloc; | ||
63 | entry->gfp_flags = gfp_flags; | ||
64 | entry->node = node; | ||
65 | |||
66 | if (!filter_check_discard(call, entry, tr->buffer, event)) | ||
67 | ring_buffer_unlock_commit(tr->buffer, event); | ||
68 | |||
69 | trace_wake_up(); | ||
70 | } | ||
71 | |||
72 | static inline void kmemtrace_free(enum kmemtrace_type_id type_id, | ||
73 | unsigned long call_site, | ||
74 | const void *ptr) | ||
75 | { | ||
76 | struct ftrace_event_call *call = &event_kmem_free; | ||
77 | struct trace_array *tr = kmemtrace_array; | ||
78 | struct kmemtrace_free_entry *entry; | ||
79 | struct ring_buffer_event *event; | ||
80 | |||
81 | event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); | ||
82 | if (!event) | ||
83 | return; | ||
84 | entry = ring_buffer_event_data(event); | ||
85 | tracing_generic_entry_update(&entry->ent, 0, 0); | ||
86 | |||
87 | entry->ent.type = TRACE_KMEM_FREE; | ||
88 | entry->type_id = type_id; | ||
89 | entry->call_site = call_site; | ||
90 | entry->ptr = ptr; | ||
91 | |||
92 | if (!filter_check_discard(call, entry, tr->buffer, event)) | ||
93 | ring_buffer_unlock_commit(tr->buffer, event); | ||
94 | |||
95 | trace_wake_up(); | ||
96 | } | ||
97 | |||
98 | static void kmemtrace_kmalloc(void *ignore, | ||
99 | unsigned long call_site, | ||
100 | const void *ptr, | ||
101 | size_t bytes_req, | ||
102 | size_t bytes_alloc, | ||
103 | gfp_t gfp_flags) | ||
104 | { | ||
105 | kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, | ||
106 | bytes_req, bytes_alloc, gfp_flags, -1); | ||
107 | } | ||
108 | |||
109 | static void kmemtrace_kmem_cache_alloc(void *ignore, | ||
110 | unsigned long call_site, | ||
111 | const void *ptr, | ||
112 | size_t bytes_req, | ||
113 | size_t bytes_alloc, | ||
114 | gfp_t gfp_flags) | ||
115 | { | ||
116 | kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, | ||
117 | bytes_req, bytes_alloc, gfp_flags, -1); | ||
118 | } | ||
119 | |||
120 | static void kmemtrace_kmalloc_node(void *ignore, | ||
121 | unsigned long call_site, | ||
122 | const void *ptr, | ||
123 | size_t bytes_req, | ||
124 | size_t bytes_alloc, | ||
125 | gfp_t gfp_flags, | ||
126 | int node) | ||
127 | { | ||
128 | kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, | ||
129 | bytes_req, bytes_alloc, gfp_flags, node); | ||
130 | } | ||
131 | |||
132 | static void kmemtrace_kmem_cache_alloc_node(void *ignore, | ||
133 | unsigned long call_site, | ||
134 | const void *ptr, | ||
135 | size_t bytes_req, | ||
136 | size_t bytes_alloc, | ||
137 | gfp_t gfp_flags, | ||
138 | int node) | ||
139 | { | ||
140 | kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, | ||
141 | bytes_req, bytes_alloc, gfp_flags, node); | ||
142 | } | ||
143 | |||
144 | static void | ||
145 | kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) | ||
146 | { | ||
147 | kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); | ||
148 | } | ||
149 | |||
150 | static void kmemtrace_kmem_cache_free(void *ignore, | ||
151 | unsigned long call_site, const void *ptr) | ||
152 | { | ||
153 | kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); | ||
154 | } | ||
155 | |||
156 | static int kmemtrace_start_probes(void) | ||
157 | { | ||
158 | int err; | ||
159 | |||
160 | err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); | ||
161 | if (err) | ||
162 | return err; | ||
163 | err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); | ||
164 | if (err) | ||
165 | return err; | ||
166 | err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); | ||
167 | if (err) | ||
168 | return err; | ||
169 | err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); | ||
170 | if (err) | ||
171 | return err; | ||
172 | err = register_trace_kfree(kmemtrace_kfree, NULL); | ||
173 | if (err) | ||
174 | return err; | ||
175 | err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); | ||
176 | |||
177 | return err; | ||
178 | } | ||
179 | |||
180 | static void kmemtrace_stop_probes(void) | ||
181 | { | ||
182 | unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); | ||
183 | unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); | ||
184 | unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); | ||
185 | unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); | ||
186 | unregister_trace_kfree(kmemtrace_kfree, NULL); | ||
187 | unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); | ||
188 | } | ||
189 | |||
190 | static int kmem_trace_init(struct trace_array *tr) | ||
191 | { | ||
192 | kmemtrace_array = tr; | ||
193 | |||
194 | tracing_reset_online_cpus(tr); | ||
195 | |||
196 | kmemtrace_start_probes(); | ||
197 | |||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void kmem_trace_reset(struct trace_array *tr) | ||
202 | { | ||
203 | kmemtrace_stop_probes(); | ||
204 | } | ||
205 | |||
206 | static void kmemtrace_headers(struct seq_file *s) | ||
207 | { | ||
208 | /* Don't need headers for the original kmemtrace output */ | ||
209 | if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) | ||
210 | return; | ||
211 | |||
212 | seq_printf(s, "#\n"); | ||
213 | seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " | ||
214 | " POINTER NODE CALLER\n"); | ||
215 | seq_printf(s, "# FREE | | | | " | ||
216 | " | | | |\n"); | ||
217 | seq_printf(s, "# |\n\n"); | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * The following functions give the original output from kmemtrace, | ||
222 | * plus the origin CPU, since reordering occurs in-kernel now. | ||
223 | */ | ||
224 | |||
225 | #define KMEMTRACE_USER_ALLOC 0 | ||
226 | #define KMEMTRACE_USER_FREE 1 | ||
227 | |||
228 | struct kmemtrace_user_event { | ||
229 | u8 event_id; | ||
230 | u8 type_id; | ||
231 | u16 event_size; | ||
232 | u32 cpu; | ||
233 | u64 timestamp; | ||
234 | unsigned long call_site; | ||
235 | unsigned long ptr; | ||
236 | }; | ||
237 | |||
238 | struct kmemtrace_user_event_alloc { | ||
239 | size_t bytes_req; | ||
240 | size_t bytes_alloc; | ||
241 | unsigned gfp_flags; | ||
242 | int node; | ||
243 | }; | ||
244 | |||
245 | static enum print_line_t | ||
246 | kmemtrace_print_alloc(struct trace_iterator *iter, int flags, | ||
247 | struct trace_event *event) | ||
248 | { | ||
249 | struct trace_seq *s = &iter->seq; | ||
250 | struct kmemtrace_alloc_entry *entry; | ||
251 | int ret; | ||
252 | |||
253 | trace_assign_type(entry, iter->ent); | ||
254 | |||
255 | ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " | ||
256 | "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", | ||
257 | entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, | ||
258 | (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, | ||
259 | (unsigned long)entry->gfp_flags, entry->node); | ||
260 | |||
261 | if (!ret) | ||
262 | return TRACE_TYPE_PARTIAL_LINE; | ||
263 | return TRACE_TYPE_HANDLED; | ||
264 | } | ||
265 | |||
266 | static enum print_line_t | ||
267 | kmemtrace_print_free(struct trace_iterator *iter, int flags, | ||
268 | struct trace_event *event) | ||
269 | { | ||
270 | struct trace_seq *s = &iter->seq; | ||
271 | struct kmemtrace_free_entry *entry; | ||
272 | int ret; | ||
273 | |||
274 | trace_assign_type(entry, iter->ent); | ||
275 | |||
276 | ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", | ||
277 | entry->type_id, (void *)entry->call_site, | ||
278 | (unsigned long)entry->ptr); | ||
279 | |||
280 | if (!ret) | ||
281 | return TRACE_TYPE_PARTIAL_LINE; | ||
282 | return TRACE_TYPE_HANDLED; | ||
283 | } | ||
284 | |||
285 | static enum print_line_t | ||
286 | kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, | ||
287 | struct trace_event *event) | ||
288 | { | ||
289 | struct trace_seq *s = &iter->seq; | ||
290 | struct kmemtrace_alloc_entry *entry; | ||
291 | struct kmemtrace_user_event *ev; | ||
292 | struct kmemtrace_user_event_alloc *ev_alloc; | ||
293 | |||
294 | trace_assign_type(entry, iter->ent); | ||
295 | |||
296 | ev = trace_seq_reserve(s, sizeof(*ev)); | ||
297 | if (!ev) | ||
298 | return TRACE_TYPE_PARTIAL_LINE; | ||
299 | |||
300 | ev->event_id = KMEMTRACE_USER_ALLOC; | ||
301 | ev->type_id = entry->type_id; | ||
302 | ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); | ||
303 | ev->cpu = iter->cpu; | ||
304 | ev->timestamp = iter->ts; | ||
305 | ev->call_site = entry->call_site; | ||
306 | ev->ptr = (unsigned long)entry->ptr; | ||
307 | |||
308 | ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); | ||
309 | if (!ev_alloc) | ||
310 | return TRACE_TYPE_PARTIAL_LINE; | ||
311 | |||
312 | ev_alloc->bytes_req = entry->bytes_req; | ||
313 | ev_alloc->bytes_alloc = entry->bytes_alloc; | ||
314 | ev_alloc->gfp_flags = entry->gfp_flags; | ||
315 | ev_alloc->node = entry->node; | ||
316 | |||
317 | return TRACE_TYPE_HANDLED; | ||
318 | } | ||
319 | |||
320 | static enum print_line_t | ||
321 | kmemtrace_print_free_user(struct trace_iterator *iter, int flags, | ||
322 | struct trace_event *event) | ||
323 | { | ||
324 | struct trace_seq *s = &iter->seq; | ||
325 | struct kmemtrace_free_entry *entry; | ||
326 | struct kmemtrace_user_event *ev; | ||
327 | |||
328 | trace_assign_type(entry, iter->ent); | ||
329 | |||
330 | ev = trace_seq_reserve(s, sizeof(*ev)); | ||
331 | if (!ev) | ||
332 | return TRACE_TYPE_PARTIAL_LINE; | ||
333 | |||
334 | ev->event_id = KMEMTRACE_USER_FREE; | ||
335 | ev->type_id = entry->type_id; | ||
336 | ev->event_size = sizeof(*ev); | ||
337 | ev->cpu = iter->cpu; | ||
338 | ev->timestamp = iter->ts; | ||
339 | ev->call_site = entry->call_site; | ||
340 | ev->ptr = (unsigned long)entry->ptr; | ||
341 | |||
342 | return TRACE_TYPE_HANDLED; | ||
343 | } | ||
344 | |||
345 | /* The two other following provide a more minimalistic output */ | ||
346 | static enum print_line_t | ||
347 | kmemtrace_print_alloc_compress(struct trace_iterator *iter) | ||
348 | { | ||
349 | struct kmemtrace_alloc_entry *entry; | ||
350 | struct trace_seq *s = &iter->seq; | ||
351 | int ret; | ||
352 | |||
353 | trace_assign_type(entry, iter->ent); | ||
354 | |||
355 | /* Alloc entry */ | ||
356 | ret = trace_seq_printf(s, " + "); | ||
357 | if (!ret) | ||
358 | return TRACE_TYPE_PARTIAL_LINE; | ||
359 | |||
360 | /* Type */ | ||
361 | switch (entry->type_id) { | ||
362 | case KMEMTRACE_TYPE_KMALLOC: | ||
363 | ret = trace_seq_printf(s, "K "); | ||
364 | break; | ||
365 | case KMEMTRACE_TYPE_CACHE: | ||
366 | ret = trace_seq_printf(s, "C "); | ||
367 | break; | ||
368 | case KMEMTRACE_TYPE_PAGES: | ||
369 | ret = trace_seq_printf(s, "P "); | ||
370 | break; | ||
371 | default: | ||
372 | ret = trace_seq_printf(s, "? "); | ||
373 | } | ||
374 | |||
375 | if (!ret) | ||
376 | return TRACE_TYPE_PARTIAL_LINE; | ||
377 | |||
378 | /* Requested */ | ||
379 | ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); | ||
380 | if (!ret) | ||
381 | return TRACE_TYPE_PARTIAL_LINE; | ||
382 | |||
383 | /* Allocated */ | ||
384 | ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); | ||
385 | if (!ret) | ||
386 | return TRACE_TYPE_PARTIAL_LINE; | ||
387 | |||
388 | /* Flags | ||
389 | * TODO: would be better to see the name of the GFP flag names | ||
390 | */ | ||
391 | ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); | ||
392 | if (!ret) | ||
393 | return TRACE_TYPE_PARTIAL_LINE; | ||
394 | |||
395 | /* Pointer to allocated */ | ||
396 | ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); | ||
397 | if (!ret) | ||
398 | return TRACE_TYPE_PARTIAL_LINE; | ||
399 | |||
400 | /* Node and call site*/ | ||
401 | ret = trace_seq_printf(s, "%4d %pf\n", entry->node, | ||
402 | (void *)entry->call_site); | ||
403 | if (!ret) | ||
404 | return TRACE_TYPE_PARTIAL_LINE; | ||
405 | |||
406 | return TRACE_TYPE_HANDLED; | ||
407 | } | ||
408 | |||
409 | static enum print_line_t | ||
410 | kmemtrace_print_free_compress(struct trace_iterator *iter) | ||
411 | { | ||
412 | struct kmemtrace_free_entry *entry; | ||
413 | struct trace_seq *s = &iter->seq; | ||
414 | int ret; | ||
415 | |||
416 | trace_assign_type(entry, iter->ent); | ||
417 | |||
418 | /* Free entry */ | ||
419 | ret = trace_seq_printf(s, " - "); | ||
420 | if (!ret) | ||
421 | return TRACE_TYPE_PARTIAL_LINE; | ||
422 | |||
423 | /* Type */ | ||
424 | switch (entry->type_id) { | ||
425 | case KMEMTRACE_TYPE_KMALLOC: | ||
426 | ret = trace_seq_printf(s, "K "); | ||
427 | break; | ||
428 | case KMEMTRACE_TYPE_CACHE: | ||
429 | ret = trace_seq_printf(s, "C "); | ||
430 | break; | ||
431 | case KMEMTRACE_TYPE_PAGES: | ||
432 | ret = trace_seq_printf(s, "P "); | ||
433 | break; | ||
434 | default: | ||
435 | ret = trace_seq_printf(s, "? "); | ||
436 | } | ||
437 | |||
438 | if (!ret) | ||
439 | return TRACE_TYPE_PARTIAL_LINE; | ||
440 | |||
441 | /* Skip requested/allocated/flags */ | ||
442 | ret = trace_seq_printf(s, " "); | ||
443 | if (!ret) | ||
444 | return TRACE_TYPE_PARTIAL_LINE; | ||
445 | |||
446 | /* Pointer to allocated */ | ||
447 | ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); | ||
448 | if (!ret) | ||
449 | return TRACE_TYPE_PARTIAL_LINE; | ||
450 | |||
451 | /* Skip node and print call site*/ | ||
452 | ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); | ||
453 | if (!ret) | ||
454 | return TRACE_TYPE_PARTIAL_LINE; | ||
455 | |||
456 | return TRACE_TYPE_HANDLED; | ||
457 | } | ||
458 | |||
459 | static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) | ||
460 | { | ||
461 | struct trace_entry *entry = iter->ent; | ||
462 | |||
463 | if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) | ||
464 | return TRACE_TYPE_UNHANDLED; | ||
465 | |||
466 | switch (entry->type) { | ||
467 | case TRACE_KMEM_ALLOC: | ||
468 | return kmemtrace_print_alloc_compress(iter); | ||
469 | case TRACE_KMEM_FREE: | ||
470 | return kmemtrace_print_free_compress(iter); | ||
471 | default: | ||
472 | return TRACE_TYPE_UNHANDLED; | ||
473 | } | ||
474 | } | ||
475 | |||
476 | static struct trace_event_functions kmem_trace_alloc_funcs = { | ||
477 | .trace = kmemtrace_print_alloc, | ||
478 | .binary = kmemtrace_print_alloc_user, | ||
479 | }; | ||
480 | |||
481 | static struct trace_event kmem_trace_alloc = { | ||
482 | .type = TRACE_KMEM_ALLOC, | ||
483 | .funcs = &kmem_trace_alloc_funcs, | ||
484 | }; | ||
485 | |||
486 | static struct trace_event_functions kmem_trace_free_funcs = { | ||
487 | .trace = kmemtrace_print_free, | ||
488 | .binary = kmemtrace_print_free_user, | ||
489 | }; | ||
490 | |||
491 | static struct trace_event kmem_trace_free = { | ||
492 | .type = TRACE_KMEM_FREE, | ||
493 | .funcs = &kmem_trace_free_funcs, | ||
494 | }; | ||
495 | |||
496 | static struct tracer kmem_tracer __read_mostly = { | ||
497 | .name = "kmemtrace", | ||
498 | .init = kmem_trace_init, | ||
499 | .reset = kmem_trace_reset, | ||
500 | .print_line = kmemtrace_print_line, | ||
501 | .print_header = kmemtrace_headers, | ||
502 | .flags = &kmem_tracer_flags | ||
503 | }; | ||
504 | |||
505 | void kmemtrace_init(void) | ||
506 | { | ||
507 | /* earliest opportunity to start kmem tracing */ | ||
508 | } | ||
509 | |||
510 | static int __init init_kmem_tracer(void) | ||
511 | { | ||
512 | if (!register_ftrace_event(&kmem_trace_alloc)) { | ||
513 | pr_warning("Warning: could not register kmem events\n"); | ||
514 | return 1; | ||
515 | } | ||
516 | |||
517 | if (!register_ftrace_event(&kmem_trace_free)) { | ||
518 | pr_warning("Warning: could not register kmem events\n"); | ||
519 | return 1; | ||
520 | } | ||
521 | |||
522 | if (register_tracer(&kmem_tracer) != 0) { | ||
523 | pr_warning("Warning: could not register the kmem tracer\n"); | ||
524 | return 1; | ||
525 | } | ||
526 | |||
527 | return 0; | ||
528 | } | ||
529 | device_initcall(init_kmem_tracer); | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6ea8b85..492197e2f86c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) | |||
443 | */ | 443 | */ |
444 | struct ring_buffer_per_cpu { | 444 | struct ring_buffer_per_cpu { |
445 | int cpu; | 445 | int cpu; |
446 | atomic_t record_disabled; | ||
446 | struct ring_buffer *buffer; | 447 | struct ring_buffer *buffer; |
447 | spinlock_t reader_lock; /* serialize readers */ | 448 | spinlock_t reader_lock; /* serialize readers */ |
448 | arch_spinlock_t lock; | 449 | arch_spinlock_t lock; |
@@ -462,7 +463,6 @@ struct ring_buffer_per_cpu { | |||
462 | unsigned long read; | 463 | unsigned long read; |
463 | u64 write_stamp; | 464 | u64 write_stamp; |
464 | u64 read_stamp; | 465 | u64 read_stamp; |
465 | atomic_t record_disabled; | ||
466 | }; | 466 | }; |
467 | 467 | ||
468 | struct ring_buffer { | 468 | struct ring_buffer { |
@@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void) | |||
2242 | 2242 | ||
2243 | #endif | 2243 | #endif |
2244 | 2244 | ||
2245 | static DEFINE_PER_CPU(int, rb_need_resched); | ||
2246 | |||
2247 | /** | 2245 | /** |
2248 | * ring_buffer_lock_reserve - reserve a part of the buffer | 2246 | * ring_buffer_lock_reserve - reserve a part of the buffer |
2249 | * @buffer: the ring buffer to reserve from | 2247 | * @buffer: the ring buffer to reserve from |
@@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2264 | { | 2262 | { |
2265 | struct ring_buffer_per_cpu *cpu_buffer; | 2263 | struct ring_buffer_per_cpu *cpu_buffer; |
2266 | struct ring_buffer_event *event; | 2264 | struct ring_buffer_event *event; |
2267 | int cpu, resched; | 2265 | int cpu; |
2268 | 2266 | ||
2269 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2267 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2270 | return NULL; | 2268 | return NULL; |
2271 | 2269 | ||
2272 | /* If we are tracing schedule, we don't want to recurse */ | 2270 | /* If we are tracing schedule, we don't want to recurse */ |
2273 | resched = ftrace_preempt_disable(); | 2271 | preempt_disable_notrace(); |
2274 | 2272 | ||
2275 | if (atomic_read(&buffer->record_disabled)) | 2273 | if (atomic_read(&buffer->record_disabled)) |
2276 | goto out_nocheck; | 2274 | goto out_nocheck; |
@@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
2295 | if (!event) | 2293 | if (!event) |
2296 | goto out; | 2294 | goto out; |
2297 | 2295 | ||
2298 | /* | ||
2299 | * Need to store resched state on this cpu. | ||
2300 | * Only the first needs to. | ||
2301 | */ | ||
2302 | |||
2303 | if (preempt_count() == 1) | ||
2304 | per_cpu(rb_need_resched, cpu) = resched; | ||
2305 | |||
2306 | return event; | 2296 | return event; |
2307 | 2297 | ||
2308 | out: | 2298 | out: |
2309 | trace_recursive_unlock(); | 2299 | trace_recursive_unlock(); |
2310 | 2300 | ||
2311 | out_nocheck: | 2301 | out_nocheck: |
2312 | ftrace_preempt_enable(resched); | 2302 | preempt_enable_notrace(); |
2313 | return NULL; | 2303 | return NULL; |
2314 | } | 2304 | } |
2315 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); | 2305 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); |
@@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, | |||
2355 | 2345 | ||
2356 | trace_recursive_unlock(); | 2346 | trace_recursive_unlock(); |
2357 | 2347 | ||
2358 | /* | 2348 | preempt_enable_notrace(); |
2359 | * Only the last preempt count needs to restore preemption. | ||
2360 | */ | ||
2361 | if (preempt_count() == 1) | ||
2362 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
2363 | else | ||
2364 | preempt_enable_no_resched_notrace(); | ||
2365 | 2349 | ||
2366 | return 0; | 2350 | return 0; |
2367 | } | 2351 | } |
@@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, | |||
2469 | 2453 | ||
2470 | trace_recursive_unlock(); | 2454 | trace_recursive_unlock(); |
2471 | 2455 | ||
2472 | /* | 2456 | preempt_enable_notrace(); |
2473 | * Only the last preempt count needs to restore preemption. | ||
2474 | */ | ||
2475 | if (preempt_count() == 1) | ||
2476 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
2477 | else | ||
2478 | preempt_enable_no_resched_notrace(); | ||
2479 | 2457 | ||
2480 | } | 2458 | } |
2481 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | 2459 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); |
@@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2501 | struct ring_buffer_event *event; | 2479 | struct ring_buffer_event *event; |
2502 | void *body; | 2480 | void *body; |
2503 | int ret = -EBUSY; | 2481 | int ret = -EBUSY; |
2504 | int cpu, resched; | 2482 | int cpu; |
2505 | 2483 | ||
2506 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2484 | if (ring_buffer_flags != RB_BUFFERS_ON) |
2507 | return -EBUSY; | 2485 | return -EBUSY; |
2508 | 2486 | ||
2509 | resched = ftrace_preempt_disable(); | 2487 | preempt_disable_notrace(); |
2510 | 2488 | ||
2511 | if (atomic_read(&buffer->record_disabled)) | 2489 | if (atomic_read(&buffer->record_disabled)) |
2512 | goto out; | 2490 | goto out; |
@@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
2536 | 2514 | ||
2537 | ret = 0; | 2515 | ret = 0; |
2538 | out: | 2516 | out: |
2539 | ftrace_preempt_enable(resched); | 2517 | preempt_enable_notrace(); |
2540 | 2518 | ||
2541 | return ret; | 2519 | return ret; |
2542 | } | 2520 | } |
@@ -3007,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) | |||
3007 | 2985 | ||
3008 | static void rb_advance_iter(struct ring_buffer_iter *iter) | 2986 | static void rb_advance_iter(struct ring_buffer_iter *iter) |
3009 | { | 2987 | { |
3010 | struct ring_buffer *buffer; | ||
3011 | struct ring_buffer_per_cpu *cpu_buffer; | 2988 | struct ring_buffer_per_cpu *cpu_buffer; |
3012 | struct ring_buffer_event *event; | 2989 | struct ring_buffer_event *event; |
3013 | unsigned length; | 2990 | unsigned length; |
3014 | 2991 | ||
3015 | cpu_buffer = iter->cpu_buffer; | 2992 | cpu_buffer = iter->cpu_buffer; |
3016 | buffer = cpu_buffer->buffer; | ||
3017 | 2993 | ||
3018 | /* | 2994 | /* |
3019 | * Check if we are at the end of the buffer. | 2995 | * Check if we are at the end of the buffer. |
@@ -3868,6 +3844,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3868 | rpos = reader->read; | 3844 | rpos = reader->read; |
3869 | pos += size; | 3845 | pos += size; |
3870 | 3846 | ||
3847 | if (rpos >= commit) | ||
3848 | break; | ||
3849 | |||
3871 | event = rb_reader_event(cpu_buffer); | 3850 | event = rb_reader_event(cpu_buffer); |
3872 | size = rb_event_length(event); | 3851 | size = rb_event_length(event); |
3873 | } while (len > size); | 3852 | } while (len > size); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 086d36316805..9ec59f541156 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void) | |||
101 | preempt_enable(); | 101 | preempt_enable(); |
102 | } | 102 | } |
103 | 103 | ||
104 | static cpumask_var_t __read_mostly tracing_buffer_mask; | 104 | cpumask_var_t __read_mostly tracing_buffer_mask; |
105 | |||
106 | #define for_each_tracing_cpu(cpu) \ | ||
107 | for_each_cpu(cpu, tracing_buffer_mask) | ||
108 | 105 | ||
109 | /* | 106 | /* |
110 | * ftrace_dump_on_oops - variable to dump ftrace buffer on oops | 107 | * ftrace_dump_on_oops - variable to dump ftrace buffer on oops |
@@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
344 | /* trace_flags holds trace_options default values */ | 341 | /* trace_flags holds trace_options default values */ |
345 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 342 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
346 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 343 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
347 | TRACE_ITER_GRAPH_TIME; | 344 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; |
348 | 345 | ||
349 | static int trace_stop_count; | 346 | static int trace_stop_count; |
350 | static DEFINE_SPINLOCK(tracing_start_lock); | 347 | static DEFINE_SPINLOCK(tracing_start_lock); |
@@ -428,6 +425,7 @@ static const char *trace_options[] = { | |||
428 | "latency-format", | 425 | "latency-format", |
429 | "sleep-time", | 426 | "sleep-time", |
430 | "graph-time", | 427 | "graph-time", |
428 | "record-cmd", | ||
431 | NULL | 429 | NULL |
432 | }; | 430 | }; |
433 | 431 | ||
@@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
659 | return; | 657 | return; |
660 | 658 | ||
661 | WARN_ON_ONCE(!irqs_disabled()); | 659 | WARN_ON_ONCE(!irqs_disabled()); |
660 | if (!current_trace->use_max_tr) { | ||
661 | WARN_ON_ONCE(1); | ||
662 | return; | ||
663 | } | ||
662 | arch_spin_lock(&ftrace_max_lock); | 664 | arch_spin_lock(&ftrace_max_lock); |
663 | 665 | ||
664 | tr->buffer = max_tr.buffer; | 666 | tr->buffer = max_tr.buffer; |
@@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
685 | return; | 687 | return; |
686 | 688 | ||
687 | WARN_ON_ONCE(!irqs_disabled()); | 689 | WARN_ON_ONCE(!irqs_disabled()); |
690 | if (!current_trace->use_max_tr) { | ||
691 | WARN_ON_ONCE(1); | ||
692 | return; | ||
693 | } | ||
694 | |||
688 | arch_spin_lock(&ftrace_max_lock); | 695 | arch_spin_lock(&ftrace_max_lock); |
689 | 696 | ||
690 | ftrace_disable_cpu(); | 697 | ftrace_disable_cpu(); |
@@ -729,18 +736,11 @@ __acquires(kernel_lock) | |||
729 | return -1; | 736 | return -1; |
730 | } | 737 | } |
731 | 738 | ||
732 | if (strlen(type->name) > MAX_TRACER_SIZE) { | 739 | if (strlen(type->name) >= MAX_TRACER_SIZE) { |
733 | pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); | 740 | pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); |
734 | return -1; | 741 | return -1; |
735 | } | 742 | } |
736 | 743 | ||
737 | /* | ||
738 | * When this gets called we hold the BKL which means that | ||
739 | * preemption is disabled. Various trace selftests however | ||
740 | * need to disable and enable preemption for successful tests. | ||
741 | * So we drop the BKL here and grab it after the tests again. | ||
742 | */ | ||
743 | unlock_kernel(); | ||
744 | mutex_lock(&trace_types_lock); | 744 | mutex_lock(&trace_types_lock); |
745 | 745 | ||
746 | tracing_selftest_running = true; | 746 | tracing_selftest_running = true; |
@@ -822,7 +822,6 @@ __acquires(kernel_lock) | |||
822 | #endif | 822 | #endif |
823 | 823 | ||
824 | out_unlock: | 824 | out_unlock: |
825 | lock_kernel(); | ||
826 | return ret; | 825 | return ret; |
827 | } | 826 | } |
828 | 827 | ||
@@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
1331 | 1330 | ||
1332 | #endif /* CONFIG_STACKTRACE */ | 1331 | #endif /* CONFIG_STACKTRACE */ |
1333 | 1332 | ||
1334 | static void | ||
1335 | ftrace_trace_special(void *__tr, | ||
1336 | unsigned long arg1, unsigned long arg2, unsigned long arg3, | ||
1337 | int pc) | ||
1338 | { | ||
1339 | struct ftrace_event_call *call = &event_special; | ||
1340 | struct ring_buffer_event *event; | ||
1341 | struct trace_array *tr = __tr; | ||
1342 | struct ring_buffer *buffer = tr->buffer; | ||
1343 | struct special_entry *entry; | ||
1344 | |||
1345 | event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, | ||
1346 | sizeof(*entry), 0, pc); | ||
1347 | if (!event) | ||
1348 | return; | ||
1349 | entry = ring_buffer_event_data(event); | ||
1350 | entry->arg1 = arg1; | ||
1351 | entry->arg2 = arg2; | ||
1352 | entry->arg3 = arg3; | ||
1353 | |||
1354 | if (!filter_check_discard(call, entry, buffer, event)) | ||
1355 | trace_buffer_unlock_commit(buffer, event, 0, pc); | ||
1356 | } | ||
1357 | |||
1358 | void | ||
1359 | __trace_special(void *__tr, void *__data, | ||
1360 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
1361 | { | ||
1362 | ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); | ||
1363 | } | ||
1364 | |||
1365 | void | ||
1366 | ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
1367 | { | ||
1368 | struct trace_array *tr = &global_trace; | ||
1369 | struct trace_array_cpu *data; | ||
1370 | unsigned long flags; | ||
1371 | int cpu; | ||
1372 | int pc; | ||
1373 | |||
1374 | if (tracing_disabled) | ||
1375 | return; | ||
1376 | |||
1377 | pc = preempt_count(); | ||
1378 | local_irq_save(flags); | ||
1379 | cpu = raw_smp_processor_id(); | ||
1380 | data = tr->data[cpu]; | ||
1381 | |||
1382 | if (likely(atomic_inc_return(&data->disabled) == 1)) | ||
1383 | ftrace_trace_special(tr, arg1, arg2, arg3, pc); | ||
1384 | |||
1385 | atomic_dec(&data->disabled); | ||
1386 | local_irq_restore(flags); | ||
1387 | } | ||
1388 | |||
1389 | /** | 1333 | /** |
1390 | * trace_vbprintk - write binary msg to tracing buffer | 1334 | * trace_vbprintk - write binary msg to tracing buffer |
1391 | * | 1335 | * |
@@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1404 | struct bprint_entry *entry; | 1348 | struct bprint_entry *entry; |
1405 | unsigned long flags; | 1349 | unsigned long flags; |
1406 | int disable; | 1350 | int disable; |
1407 | int resched; | ||
1408 | int cpu, len = 0, size, pc; | 1351 | int cpu, len = 0, size, pc; |
1409 | 1352 | ||
1410 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1353 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
@@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
1414 | pause_graph_tracing(); | 1357 | pause_graph_tracing(); |
1415 | 1358 | ||
1416 | pc = preempt_count(); | 1359 | pc = preempt_count(); |
1417 | resched = ftrace_preempt_disable(); | 1360 | preempt_disable_notrace(); |
1418 | cpu = raw_smp_processor_id(); | 1361 | cpu = raw_smp_processor_id(); |
1419 | data = tr->data[cpu]; | 1362 | data = tr->data[cpu]; |
1420 | 1363 | ||
@@ -1452,7 +1395,7 @@ out_unlock: | |||
1452 | 1395 | ||
1453 | out: | 1396 | out: |
1454 | atomic_dec_return(&data->disabled); | 1397 | atomic_dec_return(&data->disabled); |
1455 | ftrace_preempt_enable(resched); | 1398 | preempt_enable_notrace(); |
1456 | unpause_graph_tracing(); | 1399 | unpause_graph_tracing(); |
1457 | 1400 | ||
1458 | return len; | 1401 | return len; |
@@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) | |||
1539 | } | 1482 | } |
1540 | EXPORT_SYMBOL_GPL(trace_vprintk); | 1483 | EXPORT_SYMBOL_GPL(trace_vprintk); |
1541 | 1484 | ||
1542 | enum trace_file_type { | ||
1543 | TRACE_FILE_LAT_FMT = 1, | ||
1544 | TRACE_FILE_ANNOTATE = 2, | ||
1545 | }; | ||
1546 | |||
1547 | static void trace_iterator_increment(struct trace_iterator *iter) | 1485 | static void trace_iterator_increment(struct trace_iterator *iter) |
1548 | { | 1486 | { |
1549 | /* Don't allow ftrace to trace into the ring buffers */ | 1487 | /* Don't allow ftrace to trace into the ring buffers */ |
@@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | |||
1641 | } | 1579 | } |
1642 | 1580 | ||
1643 | /* Find the next real entry, and increment the iterator to the next entry */ | 1581 | /* Find the next real entry, and increment the iterator to the next entry */ |
1644 | static void *find_next_entry_inc(struct trace_iterator *iter) | 1582 | void *trace_find_next_entry_inc(struct trace_iterator *iter) |
1645 | { | 1583 | { |
1646 | iter->ent = __find_next_entry(iter, &iter->cpu, | 1584 | iter->ent = __find_next_entry(iter, &iter->cpu, |
1647 | &iter->lost_events, &iter->ts); | 1585 | &iter->lost_events, &iter->ts); |
@@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) | |||
1676 | return NULL; | 1614 | return NULL; |
1677 | 1615 | ||
1678 | if (iter->idx < 0) | 1616 | if (iter->idx < 0) |
1679 | ent = find_next_entry_inc(iter); | 1617 | ent = trace_find_next_entry_inc(iter); |
1680 | else | 1618 | else |
1681 | ent = iter; | 1619 | ent = iter; |
1682 | 1620 | ||
1683 | while (ent && iter->idx < i) | 1621 | while (ent && iter->idx < i) |
1684 | ent = find_next_entry_inc(iter); | 1622 | ent = trace_find_next_entry_inc(iter); |
1685 | 1623 | ||
1686 | iter->pos = *pos; | 1624 | iter->pos = *pos; |
1687 | 1625 | ||
1688 | return ent; | 1626 | return ent; |
1689 | } | 1627 | } |
1690 | 1628 | ||
1691 | static void tracing_iter_reset(struct trace_iterator *iter, int cpu) | 1629 | void tracing_iter_reset(struct trace_iterator *iter, int cpu) |
1692 | { | 1630 | { |
1693 | struct trace_array *tr = iter->tr; | 1631 | struct trace_array *tr = iter->tr; |
1694 | struct ring_buffer_event *event; | 1632 | struct ring_buffer_event *event; |
@@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter) | |||
2049 | } | 1987 | } |
2050 | 1988 | ||
2051 | /* Called with trace_event_read_lock() held. */ | 1989 | /* Called with trace_event_read_lock() held. */ |
2052 | static enum print_line_t print_trace_line(struct trace_iterator *iter) | 1990 | enum print_line_t print_trace_line(struct trace_iterator *iter) |
2053 | { | 1991 | { |
2054 | enum print_line_t ret; | 1992 | enum print_line_t ret; |
2055 | 1993 | ||
@@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = { | |||
2394 | .open = show_traces_open, | 2332 | .open = show_traces_open, |
2395 | .read = seq_read, | 2333 | .read = seq_read, |
2396 | .release = seq_release, | 2334 | .release = seq_release, |
2335 | .llseek = seq_lseek, | ||
2397 | }; | 2336 | }; |
2398 | 2337 | ||
2399 | /* | 2338 | /* |
@@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { | |||
2487 | .open = tracing_open_generic, | 2426 | .open = tracing_open_generic, |
2488 | .read = tracing_cpumask_read, | 2427 | .read = tracing_cpumask_read, |
2489 | .write = tracing_cpumask_write, | 2428 | .write = tracing_cpumask_write, |
2429 | .llseek = generic_file_llseek, | ||
2490 | }; | 2430 | }; |
2491 | 2431 | ||
2492 | static int tracing_trace_options_show(struct seq_file *m, void *v) | 2432 | static int tracing_trace_options_show(struct seq_file *m, void *v) |
@@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
2562 | trace_flags |= mask; | 2502 | trace_flags |= mask; |
2563 | else | 2503 | else |
2564 | trace_flags &= ~mask; | 2504 | trace_flags &= ~mask; |
2505 | |||
2506 | if (mask == TRACE_ITER_RECORD_CMD) | ||
2507 | trace_event_enable_cmd_record(enabled); | ||
2565 | } | 2508 | } |
2566 | 2509 | ||
2567 | static ssize_t | 2510 | static ssize_t |
@@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, | |||
2653 | static const struct file_operations tracing_readme_fops = { | 2596 | static const struct file_operations tracing_readme_fops = { |
2654 | .open = tracing_open_generic, | 2597 | .open = tracing_open_generic, |
2655 | .read = tracing_readme_read, | 2598 | .read = tracing_readme_read, |
2599 | .llseek = generic_file_llseek, | ||
2656 | }; | 2600 | }; |
2657 | 2601 | ||
2658 | static ssize_t | 2602 | static ssize_t |
@@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, | |||
2703 | static const struct file_operations tracing_saved_cmdlines_fops = { | 2647 | static const struct file_operations tracing_saved_cmdlines_fops = { |
2704 | .open = tracing_open_generic, | 2648 | .open = tracing_open_generic, |
2705 | .read = tracing_saved_cmdlines_read, | 2649 | .read = tracing_saved_cmdlines_read, |
2650 | .llseek = generic_file_llseek, | ||
2706 | }; | 2651 | }; |
2707 | 2652 | ||
2708 | static ssize_t | 2653 | static ssize_t |
@@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
2798 | if (ret < 0) | 2743 | if (ret < 0) |
2799 | return ret; | 2744 | return ret; |
2800 | 2745 | ||
2746 | if (!current_trace->use_max_tr) | ||
2747 | goto out; | ||
2748 | |||
2801 | ret = ring_buffer_resize(max_tr.buffer, size); | 2749 | ret = ring_buffer_resize(max_tr.buffer, size); |
2802 | if (ret < 0) { | 2750 | if (ret < 0) { |
2803 | int r; | 2751 | int r; |
@@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
2825 | return ret; | 2773 | return ret; |
2826 | } | 2774 | } |
2827 | 2775 | ||
2776 | max_tr.entries = size; | ||
2777 | out: | ||
2828 | global_trace.entries = size; | 2778 | global_trace.entries = size; |
2829 | 2779 | ||
2830 | return ret; | 2780 | return ret; |
2831 | } | 2781 | } |
2832 | 2782 | ||
2783 | |||
2833 | /** | 2784 | /** |
2834 | * tracing_update_buffers - used by tracing facility to expand ring buffers | 2785 | * tracing_update_buffers - used by tracing facility to expand ring buffers |
2835 | * | 2786 | * |
@@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) | |||
2890 | trace_branch_disable(); | 2841 | trace_branch_disable(); |
2891 | if (current_trace && current_trace->reset) | 2842 | if (current_trace && current_trace->reset) |
2892 | current_trace->reset(tr); | 2843 | current_trace->reset(tr); |
2893 | 2844 | if (current_trace && current_trace->use_max_tr) { | |
2845 | /* | ||
2846 | * We don't free the ring buffer. instead, resize it because | ||
2847 | * The max_tr ring buffer has some state (e.g. ring->clock) and | ||
2848 | * we want preserve it. | ||
2849 | */ | ||
2850 | ring_buffer_resize(max_tr.buffer, 1); | ||
2851 | max_tr.entries = 1; | ||
2852 | } | ||
2894 | destroy_trace_option_files(topts); | 2853 | destroy_trace_option_files(topts); |
2895 | 2854 | ||
2896 | current_trace = t; | 2855 | current_trace = t; |
2897 | 2856 | ||
2898 | topts = create_trace_option_files(current_trace); | 2857 | topts = create_trace_option_files(current_trace); |
2858 | if (current_trace->use_max_tr) { | ||
2859 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | ||
2860 | if (ret < 0) | ||
2861 | goto out; | ||
2862 | max_tr.entries = global_trace.entries; | ||
2863 | } | ||
2899 | 2864 | ||
2900 | if (t->init) { | 2865 | if (t->init) { |
2901 | ret = tracer_init(t, tr); | 2866 | ret = tracer_init(t, tr); |
@@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3032 | if (iter->trace->pipe_open) | 2997 | if (iter->trace->pipe_open) |
3033 | iter->trace->pipe_open(iter); | 2998 | iter->trace->pipe_open(iter); |
3034 | 2999 | ||
3000 | nonseekable_open(inode, filp); | ||
3035 | out: | 3001 | out: |
3036 | mutex_unlock(&trace_types_lock); | 3002 | mutex_unlock(&trace_types_lock); |
3037 | return ret; | 3003 | return ret; |
@@ -3211,7 +3177,7 @@ waitagain: | |||
3211 | 3177 | ||
3212 | trace_event_read_lock(); | 3178 | trace_event_read_lock(); |
3213 | trace_access_lock(iter->cpu_file); | 3179 | trace_access_lock(iter->cpu_file); |
3214 | while (find_next_entry_inc(iter) != NULL) { | 3180 | while (trace_find_next_entry_inc(iter) != NULL) { |
3215 | enum print_line_t ret; | 3181 | enum print_line_t ret; |
3216 | int len = iter->seq.len; | 3182 | int len = iter->seq.len; |
3217 | 3183 | ||
@@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) | |||
3294 | if (ret != TRACE_TYPE_NO_CONSUME) | 3260 | if (ret != TRACE_TYPE_NO_CONSUME) |
3295 | trace_consume(iter); | 3261 | trace_consume(iter); |
3296 | rem -= count; | 3262 | rem -= count; |
3297 | if (!find_next_entry_inc(iter)) { | 3263 | if (!trace_find_next_entry_inc(iter)) { |
3298 | rem = 0; | 3264 | rem = 0; |
3299 | iter->ent = NULL; | 3265 | iter->ent = NULL; |
3300 | break; | 3266 | break; |
@@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, | |||
3350 | if (ret <= 0) | 3316 | if (ret <= 0) |
3351 | goto out_err; | 3317 | goto out_err; |
3352 | 3318 | ||
3353 | if (!iter->ent && !find_next_entry_inc(iter)) { | 3319 | if (!iter->ent && !trace_find_next_entry_inc(iter)) { |
3354 | ret = -EFAULT; | 3320 | ret = -EFAULT; |
3355 | goto out_err; | 3321 | goto out_err; |
3356 | } | 3322 | } |
@@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3477 | } | 3443 | } |
3478 | 3444 | ||
3479 | tracing_start(); | 3445 | tracing_start(); |
3480 | max_tr.entries = global_trace.entries; | ||
3481 | mutex_unlock(&trace_types_lock); | 3446 | mutex_unlock(&trace_types_lock); |
3482 | 3447 | ||
3483 | return cnt; | 3448 | return cnt; |
@@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3498 | size_t cnt, loff_t *fpos) | 3463 | size_t cnt, loff_t *fpos) |
3499 | { | 3464 | { |
3500 | char *buf; | 3465 | char *buf; |
3466 | size_t written; | ||
3501 | 3467 | ||
3502 | if (tracing_disabled) | 3468 | if (tracing_disabled) |
3503 | return -EINVAL; | 3469 | return -EINVAL; |
@@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3519 | } else | 3485 | } else |
3520 | buf[cnt] = '\0'; | 3486 | buf[cnt] = '\0'; |
3521 | 3487 | ||
3522 | cnt = mark_printk("%s", buf); | 3488 | written = mark_printk("%s", buf); |
3523 | kfree(buf); | 3489 | kfree(buf); |
3524 | *fpos += cnt; | 3490 | *fpos += written; |
3525 | 3491 | ||
3526 | return cnt; | 3492 | /* don't tell userspace we wrote more - it might confuse them */ |
3493 | if (written > cnt) | ||
3494 | written = cnt; | ||
3495 | |||
3496 | return written; | ||
3527 | } | 3497 | } |
3528 | 3498 | ||
3529 | static int tracing_clock_show(struct seq_file *m, void *v) | 3499 | static int tracing_clock_show(struct seq_file *m, void *v) |
@@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = { | |||
3590 | .open = tracing_open_generic, | 3560 | .open = tracing_open_generic, |
3591 | .read = tracing_max_lat_read, | 3561 | .read = tracing_max_lat_read, |
3592 | .write = tracing_max_lat_write, | 3562 | .write = tracing_max_lat_write, |
3563 | .llseek = generic_file_llseek, | ||
3593 | }; | 3564 | }; |
3594 | 3565 | ||
3595 | static const struct file_operations tracing_ctrl_fops = { | 3566 | static const struct file_operations tracing_ctrl_fops = { |
3596 | .open = tracing_open_generic, | 3567 | .open = tracing_open_generic, |
3597 | .read = tracing_ctrl_read, | 3568 | .read = tracing_ctrl_read, |
3598 | .write = tracing_ctrl_write, | 3569 | .write = tracing_ctrl_write, |
3570 | .llseek = generic_file_llseek, | ||
3599 | }; | 3571 | }; |
3600 | 3572 | ||
3601 | static const struct file_operations set_tracer_fops = { | 3573 | static const struct file_operations set_tracer_fops = { |
3602 | .open = tracing_open_generic, | 3574 | .open = tracing_open_generic, |
3603 | .read = tracing_set_trace_read, | 3575 | .read = tracing_set_trace_read, |
3604 | .write = tracing_set_trace_write, | 3576 | .write = tracing_set_trace_write, |
3577 | .llseek = generic_file_llseek, | ||
3605 | }; | 3578 | }; |
3606 | 3579 | ||
3607 | static const struct file_operations tracing_pipe_fops = { | 3580 | static const struct file_operations tracing_pipe_fops = { |
@@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = { | |||
3610 | .read = tracing_read_pipe, | 3583 | .read = tracing_read_pipe, |
3611 | .splice_read = tracing_splice_read_pipe, | 3584 | .splice_read = tracing_splice_read_pipe, |
3612 | .release = tracing_release_pipe, | 3585 | .release = tracing_release_pipe, |
3586 | .llseek = no_llseek, | ||
3613 | }; | 3587 | }; |
3614 | 3588 | ||
3615 | static const struct file_operations tracing_entries_fops = { | 3589 | static const struct file_operations tracing_entries_fops = { |
3616 | .open = tracing_open_generic, | 3590 | .open = tracing_open_generic, |
3617 | .read = tracing_entries_read, | 3591 | .read = tracing_entries_read, |
3618 | .write = tracing_entries_write, | 3592 | .write = tracing_entries_write, |
3593 | .llseek = generic_file_llseek, | ||
3619 | }; | 3594 | }; |
3620 | 3595 | ||
3621 | static const struct file_operations tracing_mark_fops = { | 3596 | static const struct file_operations tracing_mark_fops = { |
3622 | .open = tracing_open_generic, | 3597 | .open = tracing_open_generic, |
3623 | .write = tracing_mark_write, | 3598 | .write = tracing_mark_write, |
3599 | .llseek = generic_file_llseek, | ||
3624 | }; | 3600 | }; |
3625 | 3601 | ||
3626 | static const struct file_operations trace_clock_fops = { | 3602 | static const struct file_operations trace_clock_fops = { |
@@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
3926 | static const struct file_operations tracing_stats_fops = { | 3902 | static const struct file_operations tracing_stats_fops = { |
3927 | .open = tracing_open_generic, | 3903 | .open = tracing_open_generic, |
3928 | .read = tracing_stats_read, | 3904 | .read = tracing_stats_read, |
3905 | .llseek = generic_file_llseek, | ||
3929 | }; | 3906 | }; |
3930 | 3907 | ||
3931 | #ifdef CONFIG_DYNAMIC_FTRACE | 3908 | #ifdef CONFIG_DYNAMIC_FTRACE |
@@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, | |||
3962 | static const struct file_operations tracing_dyn_info_fops = { | 3939 | static const struct file_operations tracing_dyn_info_fops = { |
3963 | .open = tracing_open_generic, | 3940 | .open = tracing_open_generic, |
3964 | .read = tracing_read_dyn_info, | 3941 | .read = tracing_read_dyn_info, |
3942 | .llseek = generic_file_llseek, | ||
3965 | }; | 3943 | }; |
3966 | #endif | 3944 | #endif |
3967 | 3945 | ||
@@ -4115,6 +4093,7 @@ static const struct file_operations trace_options_fops = { | |||
4115 | .open = tracing_open_generic, | 4093 | .open = tracing_open_generic, |
4116 | .read = trace_options_read, | 4094 | .read = trace_options_read, |
4117 | .write = trace_options_write, | 4095 | .write = trace_options_write, |
4096 | .llseek = generic_file_llseek, | ||
4118 | }; | 4097 | }; |
4119 | 4098 | ||
4120 | static ssize_t | 4099 | static ssize_t |
@@ -4166,6 +4145,7 @@ static const struct file_operations trace_options_core_fops = { | |||
4166 | .open = tracing_open_generic, | 4145 | .open = tracing_open_generic, |
4167 | .read = trace_options_core_read, | 4146 | .read = trace_options_core_read, |
4168 | .write = trace_options_core_write, | 4147 | .write = trace_options_core_write, |
4148 | .llseek = generic_file_llseek, | ||
4169 | }; | 4149 | }; |
4170 | 4150 | ||
4171 | struct dentry *trace_create_file(const char *name, | 4151 | struct dentry *trace_create_file(const char *name, |
@@ -4355,9 +4335,6 @@ static __init int tracer_init_debugfs(void) | |||
4355 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 4335 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
4356 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 4336 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
4357 | #endif | 4337 | #endif |
4358 | #ifdef CONFIG_SYSPROF_TRACER | ||
4359 | init_tracer_sysprof_debugfs(d_tracer); | ||
4360 | #endif | ||
4361 | 4338 | ||
4362 | create_trace_options_dir(); | 4339 | create_trace_options_dir(); |
4363 | 4340 | ||
@@ -4414,7 +4391,7 @@ static struct notifier_block trace_die_notifier = { | |||
4414 | */ | 4391 | */ |
4415 | #define KERN_TRACE KERN_EMERG | 4392 | #define KERN_TRACE KERN_EMERG |
4416 | 4393 | ||
4417 | static void | 4394 | void |
4418 | trace_printk_seq(struct trace_seq *s) | 4395 | trace_printk_seq(struct trace_seq *s) |
4419 | { | 4396 | { |
4420 | /* Probably should print a warning here. */ | 4397 | /* Probably should print a warning here. */ |
@@ -4429,6 +4406,13 @@ trace_printk_seq(struct trace_seq *s) | |||
4429 | trace_seq_init(s); | 4406 | trace_seq_init(s); |
4430 | } | 4407 | } |
4431 | 4408 | ||
4409 | void trace_init_global_iter(struct trace_iterator *iter) | ||
4410 | { | ||
4411 | iter->tr = &global_trace; | ||
4412 | iter->trace = current_trace; | ||
4413 | iter->cpu_file = TRACE_PIPE_ALL_CPU; | ||
4414 | } | ||
4415 | |||
4432 | static void | 4416 | static void |
4433 | __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | 4417 | __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) |
4434 | { | 4418 | { |
@@ -4454,8 +4438,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4454 | if (disable_tracing) | 4438 | if (disable_tracing) |
4455 | ftrace_kill(); | 4439 | ftrace_kill(); |
4456 | 4440 | ||
4441 | trace_init_global_iter(&iter); | ||
4442 | |||
4457 | for_each_tracing_cpu(cpu) { | 4443 | for_each_tracing_cpu(cpu) { |
4458 | atomic_inc(&global_trace.data[cpu]->disabled); | 4444 | atomic_inc(&iter.tr->data[cpu]->disabled); |
4459 | } | 4445 | } |
4460 | 4446 | ||
4461 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; | 4447 | old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; |
@@ -4504,7 +4490,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4504 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | 4490 | iter.iter_flags |= TRACE_FILE_LAT_FMT; |
4505 | iter.pos = -1; | 4491 | iter.pos = -1; |
4506 | 4492 | ||
4507 | if (find_next_entry_inc(&iter) != NULL) { | 4493 | if (trace_find_next_entry_inc(&iter) != NULL) { |
4508 | int ret; | 4494 | int ret; |
4509 | 4495 | ||
4510 | ret = print_trace_line(&iter); | 4496 | ret = print_trace_line(&iter); |
@@ -4526,7 +4512,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4526 | trace_flags |= old_userobj; | 4512 | trace_flags |= old_userobj; |
4527 | 4513 | ||
4528 | for_each_tracing_cpu(cpu) { | 4514 | for_each_tracing_cpu(cpu) { |
4529 | atomic_dec(&global_trace.data[cpu]->disabled); | 4515 | atomic_dec(&iter.tr->data[cpu]->disabled); |
4530 | } | 4516 | } |
4531 | tracing_on(); | 4517 | tracing_on(); |
4532 | } | 4518 | } |
@@ -4575,16 +4561,14 @@ __init static int tracer_alloc_buffers(void) | |||
4575 | 4561 | ||
4576 | 4562 | ||
4577 | #ifdef CONFIG_TRACER_MAX_TRACE | 4563 | #ifdef CONFIG_TRACER_MAX_TRACE |
4578 | max_tr.buffer = ring_buffer_alloc(ring_buf_size, | 4564 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); |
4579 | TRACE_BUFFER_FLAGS); | ||
4580 | if (!max_tr.buffer) { | 4565 | if (!max_tr.buffer) { |
4581 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4566 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
4582 | WARN_ON(1); | 4567 | WARN_ON(1); |
4583 | ring_buffer_free(global_trace.buffer); | 4568 | ring_buffer_free(global_trace.buffer); |
4584 | goto out_free_cpumask; | 4569 | goto out_free_cpumask; |
4585 | } | 4570 | } |
4586 | max_tr.entries = ring_buffer_size(max_tr.buffer); | 4571 | max_tr.entries = 1; |
4587 | WARN_ON(max_tr.entries != global_trace.entries); | ||
4588 | #endif | 4572 | #endif |
4589 | 4573 | ||
4590 | /* Allocate the first page for all buffers */ | 4574 | /* Allocate the first page for all buffers */ |
@@ -4597,9 +4581,6 @@ __init static int tracer_alloc_buffers(void) | |||
4597 | 4581 | ||
4598 | register_tracer(&nop_trace); | 4582 | register_tracer(&nop_trace); |
4599 | current_trace = &nop_trace; | 4583 | current_trace = &nop_trace; |
4600 | #ifdef CONFIG_BOOT_TRACER | ||
4601 | register_tracer(&boot_tracer); | ||
4602 | #endif | ||
4603 | /* All seems OK, enable tracing */ | 4584 | /* All seems OK, enable tracing */ |
4604 | tracing_disabled = 0; | 4585 | tracing_disabled = 0; |
4605 | 4586 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463f..d39b3c5454a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -9,10 +9,7 @@ | |||
9 | #include <linux/mmiotrace.h> | 9 | #include <linux/mmiotrace.h> |
10 | #include <linux/tracepoint.h> | 10 | #include <linux/tracepoint.h> |
11 | #include <linux/ftrace.h> | 11 | #include <linux/ftrace.h> |
12 | #include <trace/boot.h> | ||
13 | #include <linux/kmemtrace.h> | ||
14 | #include <linux/hw_breakpoint.h> | 12 | #include <linux/hw_breakpoint.h> |
15 | |||
16 | #include <linux/trace_seq.h> | 13 | #include <linux/trace_seq.h> |
17 | #include <linux/ftrace_event.h> | 14 | #include <linux/ftrace_event.h> |
18 | 15 | ||
@@ -25,30 +22,17 @@ enum trace_type { | |||
25 | TRACE_STACK, | 22 | TRACE_STACK, |
26 | TRACE_PRINT, | 23 | TRACE_PRINT, |
27 | TRACE_BPRINT, | 24 | TRACE_BPRINT, |
28 | TRACE_SPECIAL, | ||
29 | TRACE_MMIO_RW, | 25 | TRACE_MMIO_RW, |
30 | TRACE_MMIO_MAP, | 26 | TRACE_MMIO_MAP, |
31 | TRACE_BRANCH, | 27 | TRACE_BRANCH, |
32 | TRACE_BOOT_CALL, | ||
33 | TRACE_BOOT_RET, | ||
34 | TRACE_GRAPH_RET, | 28 | TRACE_GRAPH_RET, |
35 | TRACE_GRAPH_ENT, | 29 | TRACE_GRAPH_ENT, |
36 | TRACE_USER_STACK, | 30 | TRACE_USER_STACK, |
37 | TRACE_KMEM_ALLOC, | ||
38 | TRACE_KMEM_FREE, | ||
39 | TRACE_BLK, | 31 | TRACE_BLK, |
40 | TRACE_KSYM, | ||
41 | 32 | ||
42 | __TRACE_LAST_TYPE, | 33 | __TRACE_LAST_TYPE, |
43 | }; | 34 | }; |
44 | 35 | ||
45 | enum kmemtrace_type_id { | ||
46 | KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ | ||
47 | KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ | ||
48 | KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ | ||
49 | }; | ||
50 | |||
51 | extern struct tracer boot_tracer; | ||
52 | 36 | ||
53 | #undef __field | 37 | #undef __field |
54 | #define __field(type, item) type item; | 38 | #define __field(type, item) type item; |
@@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void); | |||
204 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ | 188 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ |
205 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 189 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
206 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 190 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
207 | IF_ASSIGN(var, ent, struct special_entry, 0); \ | ||
208 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 191 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
209 | TRACE_MMIO_RW); \ | 192 | TRACE_MMIO_RW); \ |
210 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 193 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
211 | TRACE_MMIO_MAP); \ | 194 | TRACE_MMIO_MAP); \ |
212 | IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ | ||
213 | IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ | ||
214 | IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ | 195 | IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ |
215 | IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ | 196 | IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ |
216 | TRACE_GRAPH_ENT); \ | 197 | TRACE_GRAPH_ENT); \ |
217 | IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ | 198 | IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ |
218 | TRACE_GRAPH_RET); \ | 199 | TRACE_GRAPH_RET); \ |
219 | IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ | ||
220 | TRACE_KMEM_ALLOC); \ | ||
221 | IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ | ||
222 | TRACE_KMEM_FREE); \ | ||
223 | IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ | ||
224 | __ftrace_bad_type(); \ | 200 | __ftrace_bad_type(); \ |
225 | } while (0) | 201 | } while (0) |
226 | 202 | ||
@@ -298,6 +274,7 @@ struct tracer { | |||
298 | struct tracer *next; | 274 | struct tracer *next; |
299 | int print_max; | 275 | int print_max; |
300 | struct tracer_flags *flags; | 276 | struct tracer_flags *flags; |
277 | int use_max_tr; | ||
301 | }; | 278 | }; |
302 | 279 | ||
303 | 280 | ||
@@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name, | |||
318 | const struct file_operations *fops); | 295 | const struct file_operations *fops); |
319 | 296 | ||
320 | struct dentry *tracing_init_dentry(void); | 297 | struct dentry *tracing_init_dentry(void); |
321 | void init_tracer_sysprof_debugfs(struct dentry *d_tracer); | ||
322 | 298 | ||
323 | struct ring_buffer_event; | 299 | struct ring_buffer_event; |
324 | 300 | ||
@@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, | |||
338 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, | 314 | struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, |
339 | int *ent_cpu, u64 *ent_ts); | 315 | int *ent_cpu, u64 *ent_ts); |
340 | 316 | ||
317 | int trace_empty(struct trace_iterator *iter); | ||
318 | |||
319 | void *trace_find_next_entry_inc(struct trace_iterator *iter); | ||
320 | |||
321 | void trace_init_global_iter(struct trace_iterator *iter); | ||
322 | |||
323 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | ||
324 | |||
341 | void default_wait_pipe(struct trace_iterator *iter); | 325 | void default_wait_pipe(struct trace_iterator *iter); |
342 | void poll_wait_pipe(struct trace_iterator *iter); | 326 | void poll_wait_pipe(struct trace_iterator *iter); |
343 | 327 | ||
@@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, | |||
355 | struct task_struct *wakee, | 339 | struct task_struct *wakee, |
356 | struct task_struct *cur, | 340 | struct task_struct *cur, |
357 | unsigned long flags, int pc); | 341 | unsigned long flags, int pc); |
358 | void trace_special(struct trace_array *tr, | ||
359 | struct trace_array_cpu *data, | ||
360 | unsigned long arg1, | ||
361 | unsigned long arg2, | ||
362 | unsigned long arg3, int pc); | ||
363 | void trace_function(struct trace_array *tr, | 342 | void trace_function(struct trace_array *tr, |
364 | unsigned long ip, | 343 | unsigned long ip, |
365 | unsigned long parent_ip, | 344 | unsigned long parent_ip, |
@@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void); | |||
380 | int register_tracer(struct tracer *type); | 359 | int register_tracer(struct tracer *type); |
381 | void unregister_tracer(struct tracer *type); | 360 | void unregister_tracer(struct tracer *type); |
382 | int is_tracing_stopped(void); | 361 | int is_tracing_stopped(void); |
362 | enum trace_file_type { | ||
363 | TRACE_FILE_LAT_FMT = 1, | ||
364 | TRACE_FILE_ANNOTATE = 2, | ||
365 | }; | ||
366 | |||
367 | extern cpumask_var_t __read_mostly tracing_buffer_mask; | ||
383 | 368 | ||
384 | extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); | 369 | #define for_each_tracing_cpu(cpu) \ |
370 | for_each_cpu(cpu, tracing_buffer_mask) | ||
385 | 371 | ||
386 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); | 372 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); |
387 | 373 | ||
@@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, | |||
452 | struct trace_array *tr); | 438 | struct trace_array *tr); |
453 | extern int trace_selftest_startup_sched_switch(struct tracer *trace, | 439 | extern int trace_selftest_startup_sched_switch(struct tracer *trace, |
454 | struct trace_array *tr); | 440 | struct trace_array *tr); |
455 | extern int trace_selftest_startup_sysprof(struct tracer *trace, | ||
456 | struct trace_array *tr); | ||
457 | extern int trace_selftest_startup_branch(struct tracer *trace, | 441 | extern int trace_selftest_startup_branch(struct tracer *trace, |
458 | struct trace_array *tr); | 442 | struct trace_array *tr); |
459 | extern int trace_selftest_startup_ksym(struct tracer *trace, | ||
460 | struct trace_array *tr); | ||
461 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | 443 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ |
462 | 444 | ||
463 | extern void *head_page(struct trace_array_cpu *data); | 445 | extern void *head_page(struct trace_array_cpu *data); |
@@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr, | |||
471 | unsigned long ip, const char *fmt, va_list args); | 453 | unsigned long ip, const char *fmt, va_list args); |
472 | int trace_array_printk(struct trace_array *tr, | 454 | int trace_array_printk(struct trace_array *tr, |
473 | unsigned long ip, const char *fmt, ...); | 455 | unsigned long ip, const char *fmt, ...); |
456 | void trace_printk_seq(struct trace_seq *s); | ||
457 | enum print_line_t print_trace_line(struct trace_iterator *iter); | ||
474 | 458 | ||
475 | extern unsigned long trace_flags; | 459 | extern unsigned long trace_flags; |
476 | 460 | ||
@@ -617,6 +601,7 @@ enum trace_iterator_flags { | |||
617 | TRACE_ITER_LATENCY_FMT = 0x20000, | 601 | TRACE_ITER_LATENCY_FMT = 0x20000, |
618 | TRACE_ITER_SLEEP_TIME = 0x40000, | 602 | TRACE_ITER_SLEEP_TIME = 0x40000, |
619 | TRACE_ITER_GRAPH_TIME = 0x80000, | 603 | TRACE_ITER_GRAPH_TIME = 0x80000, |
604 | TRACE_ITER_RECORD_CMD = 0x100000, | ||
620 | }; | 605 | }; |
621 | 606 | ||
622 | /* | 607 | /* |
@@ -628,54 +613,6 @@ enum trace_iterator_flags { | |||
628 | 613 | ||
629 | extern struct tracer nop_trace; | 614 | extern struct tracer nop_trace; |
630 | 615 | ||
631 | /** | ||
632 | * ftrace_preempt_disable - disable preemption scheduler safe | ||
633 | * | ||
634 | * When tracing can happen inside the scheduler, there exists | ||
635 | * cases that the tracing might happen before the need_resched | ||
636 | * flag is checked. If this happens and the tracer calls | ||
637 | * preempt_enable (after a disable), a schedule might take place | ||
638 | * causing an infinite recursion. | ||
639 | * | ||
640 | * To prevent this, we read the need_resched flag before | ||
641 | * disabling preemption. When we want to enable preemption we | ||
642 | * check the flag, if it is set, then we call preempt_enable_no_resched. | ||
643 | * Otherwise, we call preempt_enable. | ||
644 | * | ||
645 | * The rational for doing the above is that if need_resched is set | ||
646 | * and we have yet to reschedule, we are either in an atomic location | ||
647 | * (where we do not need to check for scheduling) or we are inside | ||
648 | * the scheduler and do not want to resched. | ||
649 | */ | ||
650 | static inline int ftrace_preempt_disable(void) | ||
651 | { | ||
652 | int resched; | ||
653 | |||
654 | resched = need_resched(); | ||
655 | preempt_disable_notrace(); | ||
656 | |||
657 | return resched; | ||
658 | } | ||
659 | |||
660 | /** | ||
661 | * ftrace_preempt_enable - enable preemption scheduler safe | ||
662 | * @resched: the return value from ftrace_preempt_disable | ||
663 | * | ||
664 | * This is a scheduler safe way to enable preemption and not miss | ||
665 | * any preemption checks. The disabled saved the state of preemption. | ||
666 | * If resched is set, then we are either inside an atomic or | ||
667 | * are inside the scheduler (we would have already scheduled | ||
668 | * otherwise). In this case, we do not want to call normal | ||
669 | * preempt_enable, but preempt_enable_no_resched instead. | ||
670 | */ | ||
671 | static inline void ftrace_preempt_enable(int resched) | ||
672 | { | ||
673 | if (resched) | ||
674 | preempt_enable_no_resched_notrace(); | ||
675 | else | ||
676 | preempt_enable_notrace(); | ||
677 | } | ||
678 | |||
679 | #ifdef CONFIG_BRANCH_TRACER | 616 | #ifdef CONFIG_BRANCH_TRACER |
680 | extern int enable_branch_tracing(struct trace_array *tr); | 617 | extern int enable_branch_tracing(struct trace_array *tr); |
681 | extern void disable_branch_tracing(void); | 618 | extern void disable_branch_tracing(void); |
@@ -766,6 +703,8 @@ struct filter_pred { | |||
766 | int pop_n; | 703 | int pop_n; |
767 | }; | 704 | }; |
768 | 705 | ||
706 | extern struct list_head ftrace_common_fields; | ||
707 | |||
769 | extern enum regex_type | 708 | extern enum regex_type |
770 | filter_parse_regex(char *buff, int len, char **search, int *not); | 709 | filter_parse_regex(char *buff, int len, char **search, int *not); |
771 | extern void print_event_filter(struct ftrace_event_call *call, | 710 | extern void print_event_filter(struct ftrace_event_call *call, |
@@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, | |||
795 | return 0; | 734 | return 0; |
796 | } | 735 | } |
797 | 736 | ||
737 | extern void trace_event_enable_cmd_record(bool enable); | ||
738 | |||
798 | extern struct mutex event_mutex; | 739 | extern struct mutex event_mutex; |
799 | extern struct list_head ftrace_events; | 740 | extern struct list_head ftrace_events; |
800 | 741 | ||
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956ad..000000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null | |||
@@ -1,185 +0,0 @@ | |||
1 | /* | ||
2 | * ring buffer based initcalls tracer | ||
3 | * | ||
4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/ftrace.h> | ||
11 | #include <linux/kallsyms.h> | ||
12 | #include <linux/time.h> | ||
13 | |||
14 | #include "trace.h" | ||
15 | #include "trace_output.h" | ||
16 | |||
17 | static struct trace_array *boot_trace; | ||
18 | static bool pre_initcalls_finished; | ||
19 | |||
20 | /* Tells the boot tracer that the pre_smp_initcalls are finished. | ||
21 | * So we are ready . | ||
22 | * It doesn't enable sched events tracing however. | ||
23 | * You have to call enable_boot_trace to do so. | ||
24 | */ | ||
25 | void start_boot_trace(void) | ||
26 | { | ||
27 | pre_initcalls_finished = true; | ||
28 | } | ||
29 | |||
30 | void enable_boot_trace(void) | ||
31 | { | ||
32 | if (boot_trace && pre_initcalls_finished) | ||
33 | tracing_start_sched_switch_record(); | ||
34 | } | ||
35 | |||
36 | void disable_boot_trace(void) | ||
37 | { | ||
38 | if (boot_trace && pre_initcalls_finished) | ||
39 | tracing_stop_sched_switch_record(); | ||
40 | } | ||
41 | |||
42 | static int boot_trace_init(struct trace_array *tr) | ||
43 | { | ||
44 | boot_trace = tr; | ||
45 | |||
46 | if (!tr) | ||
47 | return 0; | ||
48 | |||
49 | tracing_reset_online_cpus(tr); | ||
50 | |||
51 | tracing_sched_switch_assign_trace(tr); | ||
52 | return 0; | ||
53 | } | ||
54 | |||
55 | static enum print_line_t | ||
56 | initcall_call_print_line(struct trace_iterator *iter) | ||
57 | { | ||
58 | struct trace_entry *entry = iter->ent; | ||
59 | struct trace_seq *s = &iter->seq; | ||
60 | struct trace_boot_call *field; | ||
61 | struct boot_trace_call *call; | ||
62 | u64 ts; | ||
63 | unsigned long nsec_rem; | ||
64 | int ret; | ||
65 | |||
66 | trace_assign_type(field, entry); | ||
67 | call = &field->boot_call; | ||
68 | ts = iter->ts; | ||
69 | nsec_rem = do_div(ts, NSEC_PER_SEC); | ||
70 | |||
71 | ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", | ||
72 | (unsigned long)ts, nsec_rem, call->func, call->caller); | ||
73 | |||
74 | if (!ret) | ||
75 | return TRACE_TYPE_PARTIAL_LINE; | ||
76 | else | ||
77 | return TRACE_TYPE_HANDLED; | ||
78 | } | ||
79 | |||
80 | static enum print_line_t | ||
81 | initcall_ret_print_line(struct trace_iterator *iter) | ||
82 | { | ||
83 | struct trace_entry *entry = iter->ent; | ||
84 | struct trace_seq *s = &iter->seq; | ||
85 | struct trace_boot_ret *field; | ||
86 | struct boot_trace_ret *init_ret; | ||
87 | u64 ts; | ||
88 | unsigned long nsec_rem; | ||
89 | int ret; | ||
90 | |||
91 | trace_assign_type(field, entry); | ||
92 | init_ret = &field->boot_ret; | ||
93 | ts = iter->ts; | ||
94 | nsec_rem = do_div(ts, NSEC_PER_SEC); | ||
95 | |||
96 | ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " | ||
97 | "returned %d after %llu msecs\n", | ||
98 | (unsigned long) ts, | ||
99 | nsec_rem, | ||
100 | init_ret->func, init_ret->result, init_ret->duration); | ||
101 | |||
102 | if (!ret) | ||
103 | return TRACE_TYPE_PARTIAL_LINE; | ||
104 | else | ||
105 | return TRACE_TYPE_HANDLED; | ||
106 | } | ||
107 | |||
108 | static enum print_line_t initcall_print_line(struct trace_iterator *iter) | ||
109 | { | ||
110 | struct trace_entry *entry = iter->ent; | ||
111 | |||
112 | switch (entry->type) { | ||
113 | case TRACE_BOOT_CALL: | ||
114 | return initcall_call_print_line(iter); | ||
115 | case TRACE_BOOT_RET: | ||
116 | return initcall_ret_print_line(iter); | ||
117 | default: | ||
118 | return TRACE_TYPE_UNHANDLED; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | struct tracer boot_tracer __read_mostly = | ||
123 | { | ||
124 | .name = "initcall", | ||
125 | .init = boot_trace_init, | ||
126 | .reset = tracing_reset_online_cpus, | ||
127 | .print_line = initcall_print_line, | ||
128 | }; | ||
129 | |||
130 | void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) | ||
131 | { | ||
132 | struct ftrace_event_call *call = &event_boot_call; | ||
133 | struct ring_buffer_event *event; | ||
134 | struct ring_buffer *buffer; | ||
135 | struct trace_boot_call *entry; | ||
136 | struct trace_array *tr = boot_trace; | ||
137 | |||
138 | if (!tr || !pre_initcalls_finished) | ||
139 | return; | ||
140 | |||
141 | /* Get its name now since this function could | ||
142 | * disappear because it is in the .init section. | ||
143 | */ | ||
144 | sprint_symbol(bt->func, (unsigned long)fn); | ||
145 | preempt_disable(); | ||
146 | |||
147 | buffer = tr->buffer; | ||
148 | event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, | ||
149 | sizeof(*entry), 0, 0); | ||
150 | if (!event) | ||
151 | goto out; | ||
152 | entry = ring_buffer_event_data(event); | ||
153 | entry->boot_call = *bt; | ||
154 | if (!filter_check_discard(call, entry, buffer, event)) | ||
155 | trace_buffer_unlock_commit(buffer, event, 0, 0); | ||
156 | out: | ||
157 | preempt_enable(); | ||
158 | } | ||
159 | |||
160 | void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) | ||
161 | { | ||
162 | struct ftrace_event_call *call = &event_boot_ret; | ||
163 | struct ring_buffer_event *event; | ||
164 | struct ring_buffer *buffer; | ||
165 | struct trace_boot_ret *entry; | ||
166 | struct trace_array *tr = boot_trace; | ||
167 | |||
168 | if (!tr || !pre_initcalls_finished) | ||
169 | return; | ||
170 | |||
171 | sprint_symbol(bt->func, (unsigned long)fn); | ||
172 | preempt_disable(); | ||
173 | |||
174 | buffer = tr->buffer; | ||
175 | event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, | ||
176 | sizeof(*entry), 0, 0); | ||
177 | if (!event) | ||
178 | goto out; | ||
179 | entry = ring_buffer_event_data(event); | ||
180 | entry->boot_ret = *bt; | ||
181 | if (!filter_check_discard(call, entry, buffer, event)) | ||
182 | trace_buffer_unlock_commit(buffer, event, 0, 0); | ||
183 | out: | ||
184 | preempt_enable(); | ||
185 | } | ||
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..685a67d55db0 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -32,16 +32,15 @@ | |||
32 | u64 notrace trace_clock_local(void) | 32 | u64 notrace trace_clock_local(void) |
33 | { | 33 | { |
34 | u64 clock; | 34 | u64 clock; |
35 | int resched; | ||
36 | 35 | ||
37 | /* | 36 | /* |
38 | * sched_clock() is an architecture implemented, fast, scalable, | 37 | * sched_clock() is an architecture implemented, fast, scalable, |
39 | * lockless clock. It is not guaranteed to be coherent across | 38 | * lockless clock. It is not guaranteed to be coherent across |
40 | * CPUs, nor across CPU idle events. | 39 | * CPUs, nor across CPU idle events. |
41 | */ | 40 | */ |
42 | resched = ftrace_preempt_disable(); | 41 | preempt_disable_notrace(); |
43 | clock = sched_clock(); | 42 | clock = sched_clock(); |
44 | ftrace_preempt_enable(resched); | 43 | preempt_enable_notrace(); |
45 | 44 | ||
46 | return clock; | 45 | return clock; |
47 | } | 46 | } |
@@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void) | |||
56 | */ | 55 | */ |
57 | u64 notrace trace_clock(void) | 56 | u64 notrace trace_clock(void) |
58 | { | 57 | { |
59 | return cpu_clock(raw_smp_processor_id()); | 58 | return local_clock(); |
60 | } | 59 | } |
61 | 60 | ||
62 | 61 | ||
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240da..e3dfecaf13e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, | |||
151 | ); | 151 | ); |
152 | 152 | ||
153 | /* | 153 | /* |
154 | * Special (free-form) trace entry: | ||
155 | */ | ||
156 | FTRACE_ENTRY(special, special_entry, | ||
157 | |||
158 | TRACE_SPECIAL, | ||
159 | |||
160 | F_STRUCT( | ||
161 | __field( unsigned long, arg1 ) | ||
162 | __field( unsigned long, arg2 ) | ||
163 | __field( unsigned long, arg3 ) | ||
164 | ), | ||
165 | |||
166 | F_printk("(%08lx) (%08lx) (%08lx)", | ||
167 | __entry->arg1, __entry->arg2, __entry->arg3) | ||
168 | ); | ||
169 | |||
170 | /* | ||
171 | * Stack-trace entry: | 154 | * Stack-trace entry: |
172 | */ | 155 | */ |
173 | 156 | ||
@@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, | |||
271 | __entry->map_id, __entry->opcode) | 254 | __entry->map_id, __entry->opcode) |
272 | ); | 255 | ); |
273 | 256 | ||
274 | FTRACE_ENTRY(boot_call, trace_boot_call, | ||
275 | |||
276 | TRACE_BOOT_CALL, | ||
277 | |||
278 | F_STRUCT( | ||
279 | __field_struct( struct boot_trace_call, boot_call ) | ||
280 | __field_desc( pid_t, boot_call, caller ) | ||
281 | __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) | ||
282 | ), | ||
283 | |||
284 | F_printk("%d %s", __entry->caller, __entry->func) | ||
285 | ); | ||
286 | |||
287 | FTRACE_ENTRY(boot_ret, trace_boot_ret, | ||
288 | |||
289 | TRACE_BOOT_RET, | ||
290 | |||
291 | F_STRUCT( | ||
292 | __field_struct( struct boot_trace_ret, boot_ret ) | ||
293 | __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) | ||
294 | __field_desc( int, boot_ret, result ) | ||
295 | __field_desc( unsigned long, boot_ret, duration ) | ||
296 | ), | ||
297 | |||
298 | F_printk("%s %d %lx", | ||
299 | __entry->func, __entry->result, __entry->duration) | ||
300 | ); | ||
301 | 257 | ||
302 | #define TRACE_FUNC_SIZE 30 | 258 | #define TRACE_FUNC_SIZE 30 |
303 | #define TRACE_FILE_SIZE 20 | 259 | #define TRACE_FILE_SIZE 20 |
@@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, | |||
318 | __entry->func, __entry->file, __entry->correct) | 274 | __entry->func, __entry->file, __entry->correct) |
319 | ); | 275 | ); |
320 | 276 | ||
321 | FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, | ||
322 | |||
323 | TRACE_KMEM_ALLOC, | ||
324 | |||
325 | F_STRUCT( | ||
326 | __field( enum kmemtrace_type_id, type_id ) | ||
327 | __field( unsigned long, call_site ) | ||
328 | __field( const void *, ptr ) | ||
329 | __field( size_t, bytes_req ) | ||
330 | __field( size_t, bytes_alloc ) | ||
331 | __field( gfp_t, gfp_flags ) | ||
332 | __field( int, node ) | ||
333 | ), | ||
334 | |||
335 | F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" | ||
336 | " flags:%x node:%d", | ||
337 | __entry->type_id, __entry->call_site, __entry->ptr, | ||
338 | __entry->bytes_req, __entry->bytes_alloc, | ||
339 | __entry->gfp_flags, __entry->node) | ||
340 | ); | ||
341 | |||
342 | FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, | ||
343 | |||
344 | TRACE_KMEM_FREE, | ||
345 | |||
346 | F_STRUCT( | ||
347 | __field( enum kmemtrace_type_id, type_id ) | ||
348 | __field( unsigned long, call_site ) | ||
349 | __field( const void *, ptr ) | ||
350 | ), | ||
351 | |||
352 | F_printk("type:%u call_site:%lx ptr:%p", | ||
353 | __entry->type_id, __entry->call_site, __entry->ptr) | ||
354 | ); | ||
355 | |||
356 | FTRACE_ENTRY(ksym_trace, ksym_trace_entry, | ||
357 | |||
358 | TRACE_KSYM, | ||
359 | |||
360 | F_STRUCT( | ||
361 | __field( unsigned long, ip ) | ||
362 | __field( unsigned char, type ) | ||
363 | __array( char , cmd, TASK_COMM_LEN ) | ||
364 | __field( unsigned long, addr ) | ||
365 | ), | ||
366 | |||
367 | F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", | ||
368 | (void *)__entry->ip, (unsigned int)__entry->type, | ||
369 | (void *)__entry->addr, __entry->cmd) | ||
370 | ); | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a2b73f7c068..31cc4cb0dbf2 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -9,8 +9,6 @@ | |||
9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
10 | #include "trace.h" | 10 | #include "trace.h" |
11 | 11 | ||
12 | EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); | ||
13 | |||
14 | static char *perf_trace_buf[4]; | 12 | static char *perf_trace_buf[4]; |
15 | 13 | ||
16 | /* | 14 | /* |
@@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
56 | } | 54 | } |
57 | } | 55 | } |
58 | 56 | ||
59 | if (tp_event->class->reg) | 57 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); |
60 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); | ||
61 | else | ||
62 | ret = tracepoint_probe_register(tp_event->name, | ||
63 | tp_event->class->perf_probe, | ||
64 | tp_event); | ||
65 | |||
66 | if (ret) | 58 | if (ret) |
67 | goto fail; | 59 | goto fail; |
68 | 60 | ||
@@ -96,11 +88,11 @@ int perf_trace_init(struct perf_event *p_event) | |||
96 | mutex_lock(&event_mutex); | 88 | mutex_lock(&event_mutex); |
97 | list_for_each_entry(tp_event, &ftrace_events, list) { | 89 | list_for_each_entry(tp_event, &ftrace_events, list) { |
98 | if (tp_event->event.type == event_id && | 90 | if (tp_event->event.type == event_id && |
99 | tp_event->class && | 91 | tp_event->class && tp_event->class->reg && |
100 | (tp_event->class->perf_probe || | ||
101 | tp_event->class->reg) && | ||
102 | try_module_get(tp_event->mod)) { | 92 | try_module_get(tp_event->mod)) { |
103 | ret = perf_trace_event_init(tp_event, p_event); | 93 | ret = perf_trace_event_init(tp_event, p_event); |
94 | if (ret) | ||
95 | module_put(tp_event->mod); | ||
104 | break; | 96 | break; |
105 | } | 97 | } |
106 | } | 98 | } |
@@ -138,18 +130,13 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
138 | if (--tp_event->perf_refcount > 0) | 130 | if (--tp_event->perf_refcount > 0) |
139 | goto out; | 131 | goto out; |
140 | 132 | ||
141 | if (tp_event->class->reg) | 133 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); |
142 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); | ||
143 | else | ||
144 | tracepoint_probe_unregister(tp_event->name, | ||
145 | tp_event->class->perf_probe, | ||
146 | tp_event); | ||
147 | 134 | ||
148 | /* | 135 | /* |
149 | * Ensure our callback won't be called anymore. See | 136 | * Ensure our callback won't be called anymore. The buffers |
150 | * tracepoint_probe_unregister() and __DO_TRACE(). | 137 | * will be freed after that. |
151 | */ | 138 | */ |
152 | synchronize_sched(); | 139 | tracepoint_synchronize_unregister(); |
153 | 140 | ||
154 | free_percpu(tp_event->perf_events); | 141 | free_percpu(tp_event->perf_events); |
155 | tp_event->perf_events = NULL; | 142 | tp_event->perf_events = NULL; |
@@ -161,6 +148,7 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
161 | } | 148 | } |
162 | } | 149 | } |
163 | out: | 150 | out: |
151 | module_put(tp_event->mod); | ||
164 | mutex_unlock(&event_mutex); | 152 | mutex_unlock(&event_mutex); |
165 | } | 153 | } |
166 | 154 | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..4c758f146328 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -28,6 +28,7 @@ | |||
28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
29 | 29 | ||
30 | LIST_HEAD(ftrace_events); | 30 | LIST_HEAD(ftrace_events); |
31 | LIST_HEAD(ftrace_common_fields); | ||
31 | 32 | ||
32 | struct list_head * | 33 | struct list_head * |
33 | trace_get_fields(struct ftrace_event_call *event_call) | 34 | trace_get_fields(struct ftrace_event_call *event_call) |
@@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) | |||
37 | return event_call->class->get_fields(event_call); | 38 | return event_call->class->get_fields(event_call); |
38 | } | 39 | } |
39 | 40 | ||
40 | int trace_define_field(struct ftrace_event_call *call, const char *type, | 41 | static int __trace_define_field(struct list_head *head, const char *type, |
41 | const char *name, int offset, int size, int is_signed, | 42 | const char *name, int offset, int size, |
42 | int filter_type) | 43 | int is_signed, int filter_type) |
43 | { | 44 | { |
44 | struct ftrace_event_field *field; | 45 | struct ftrace_event_field *field; |
45 | struct list_head *head; | ||
46 | |||
47 | if (WARN_ON(!call->class)) | ||
48 | return 0; | ||
49 | 46 | ||
50 | field = kzalloc(sizeof(*field), GFP_KERNEL); | 47 | field = kzalloc(sizeof(*field), GFP_KERNEL); |
51 | if (!field) | 48 | if (!field) |
@@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, | |||
68 | field->size = size; | 65 | field->size = size; |
69 | field->is_signed = is_signed; | 66 | field->is_signed = is_signed; |
70 | 67 | ||
71 | head = trace_get_fields(call); | ||
72 | list_add(&field->link, head); | 68 | list_add(&field->link, head); |
73 | 69 | ||
74 | return 0; | 70 | return 0; |
@@ -80,17 +76,32 @@ err: | |||
80 | 76 | ||
81 | return -ENOMEM; | 77 | return -ENOMEM; |
82 | } | 78 | } |
79 | |||
80 | int trace_define_field(struct ftrace_event_call *call, const char *type, | ||
81 | const char *name, int offset, int size, int is_signed, | ||
82 | int filter_type) | ||
83 | { | ||
84 | struct list_head *head; | ||
85 | |||
86 | if (WARN_ON(!call->class)) | ||
87 | return 0; | ||
88 | |||
89 | head = trace_get_fields(call); | ||
90 | return __trace_define_field(head, type, name, offset, size, | ||
91 | is_signed, filter_type); | ||
92 | } | ||
83 | EXPORT_SYMBOL_GPL(trace_define_field); | 93 | EXPORT_SYMBOL_GPL(trace_define_field); |
84 | 94 | ||
85 | #define __common_field(type, item) \ | 95 | #define __common_field(type, item) \ |
86 | ret = trace_define_field(call, #type, "common_" #item, \ | 96 | ret = __trace_define_field(&ftrace_common_fields, #type, \ |
87 | offsetof(typeof(ent), item), \ | 97 | "common_" #item, \ |
88 | sizeof(ent.item), \ | 98 | offsetof(typeof(ent), item), \ |
89 | is_signed_type(type), FILTER_OTHER); \ | 99 | sizeof(ent.item), \ |
100 | is_signed_type(type), FILTER_OTHER); \ | ||
90 | if (ret) \ | 101 | if (ret) \ |
91 | return ret; | 102 | return ret; |
92 | 103 | ||
93 | static int trace_define_common_fields(struct ftrace_event_call *call) | 104 | static int trace_define_common_fields(void) |
94 | { | 105 | { |
95 | int ret; | 106 | int ret; |
96 | struct trace_entry ent; | 107 | struct trace_entry ent; |
@@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call) | |||
130 | } | 141 | } |
131 | EXPORT_SYMBOL_GPL(trace_event_raw_init); | 142 | EXPORT_SYMBOL_GPL(trace_event_raw_init); |
132 | 143 | ||
144 | int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) | ||
145 | { | ||
146 | switch (type) { | ||
147 | case TRACE_REG_REGISTER: | ||
148 | return tracepoint_probe_register(call->name, | ||
149 | call->class->probe, | ||
150 | call); | ||
151 | case TRACE_REG_UNREGISTER: | ||
152 | tracepoint_probe_unregister(call->name, | ||
153 | call->class->probe, | ||
154 | call); | ||
155 | return 0; | ||
156 | |||
157 | #ifdef CONFIG_PERF_EVENTS | ||
158 | case TRACE_REG_PERF_REGISTER: | ||
159 | return tracepoint_probe_register(call->name, | ||
160 | call->class->perf_probe, | ||
161 | call); | ||
162 | case TRACE_REG_PERF_UNREGISTER: | ||
163 | tracepoint_probe_unregister(call->name, | ||
164 | call->class->perf_probe, | ||
165 | call); | ||
166 | return 0; | ||
167 | #endif | ||
168 | } | ||
169 | return 0; | ||
170 | } | ||
171 | EXPORT_SYMBOL_GPL(ftrace_event_reg); | ||
172 | |||
173 | void trace_event_enable_cmd_record(bool enable) | ||
174 | { | ||
175 | struct ftrace_event_call *call; | ||
176 | |||
177 | mutex_lock(&event_mutex); | ||
178 | list_for_each_entry(call, &ftrace_events, list) { | ||
179 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) | ||
180 | continue; | ||
181 | |||
182 | if (enable) { | ||
183 | tracing_start_cmdline_record(); | ||
184 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; | ||
185 | } else { | ||
186 | tracing_stop_cmdline_record(); | ||
187 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; | ||
188 | } | ||
189 | } | ||
190 | mutex_unlock(&event_mutex); | ||
191 | } | ||
192 | |||
133 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, | 193 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, |
134 | int enable) | 194 | int enable) |
135 | { | 195 | { |
@@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
139 | case 0: | 199 | case 0: |
140 | if (call->flags & TRACE_EVENT_FL_ENABLED) { | 200 | if (call->flags & TRACE_EVENT_FL_ENABLED) { |
141 | call->flags &= ~TRACE_EVENT_FL_ENABLED; | 201 | call->flags &= ~TRACE_EVENT_FL_ENABLED; |
142 | tracing_stop_cmdline_record(); | 202 | if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { |
143 | if (call->class->reg) | 203 | tracing_stop_cmdline_record(); |
144 | call->class->reg(call, TRACE_REG_UNREGISTER); | 204 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; |
145 | else | 205 | } |
146 | tracepoint_probe_unregister(call->name, | 206 | call->class->reg(call, TRACE_REG_UNREGISTER); |
147 | call->class->probe, | ||
148 | call); | ||
149 | } | 207 | } |
150 | break; | 208 | break; |
151 | case 1: | 209 | case 1: |
152 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { | 210 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { |
153 | tracing_start_cmdline_record(); | 211 | if (trace_flags & TRACE_ITER_RECORD_CMD) { |
154 | if (call->class->reg) | 212 | tracing_start_cmdline_record(); |
155 | ret = call->class->reg(call, TRACE_REG_REGISTER); | 213 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; |
156 | else | 214 | } |
157 | ret = tracepoint_probe_register(call->name, | 215 | ret = call->class->reg(call, TRACE_REG_REGISTER); |
158 | call->class->probe, | ||
159 | call); | ||
160 | if (ret) { | 216 | if (ret) { |
161 | tracing_stop_cmdline_record(); | 217 | tracing_stop_cmdline_record(); |
162 | pr_info("event trace: Could not enable event " | 218 | pr_info("event trace: Could not enable event " |
@@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
194 | mutex_lock(&event_mutex); | 250 | mutex_lock(&event_mutex); |
195 | list_for_each_entry(call, &ftrace_events, list) { | 251 | list_for_each_entry(call, &ftrace_events, list) { |
196 | 252 | ||
197 | if (!call->name || !call->class || | 253 | if (!call->name || !call->class || !call->class->reg) |
198 | (!call->class->probe && !call->class->reg)) | ||
199 | continue; | 254 | continue; |
200 | 255 | ||
201 | if (match && | 256 | if (match && |
@@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
321 | * The ftrace subsystem is for showing formats only. | 376 | * The ftrace subsystem is for showing formats only. |
322 | * They can not be enabled or disabled via the event files. | 377 | * They can not be enabled or disabled via the event files. |
323 | */ | 378 | */ |
324 | if (call->class && (call->class->probe || call->class->reg)) | 379 | if (call->class && call->class->reg) |
325 | return call; | 380 | return call; |
326 | } | 381 | } |
327 | 382 | ||
@@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
474 | 529 | ||
475 | mutex_lock(&event_mutex); | 530 | mutex_lock(&event_mutex); |
476 | list_for_each_entry(call, &ftrace_events, list) { | 531 | list_for_each_entry(call, &ftrace_events, list) { |
477 | if (!call->name || !call->class || | 532 | if (!call->name || !call->class || !call->class->reg) |
478 | (!call->class->probe && !call->class->reg)) | ||
479 | continue; | 533 | continue; |
480 | 534 | ||
481 | if (system && strcmp(call->class->system, system) != 0) | 535 | if (system && strcmp(call->class->system, system) != 0) |
@@ -544,85 +598,165 @@ out: | |||
544 | return ret; | 598 | return ret; |
545 | } | 599 | } |
546 | 600 | ||
547 | static ssize_t | 601 | enum { |
548 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | 602 | FORMAT_HEADER = 1, |
549 | loff_t *ppos) | 603 | FORMAT_PRINTFMT = 2, |
604 | }; | ||
605 | |||
606 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) | ||
550 | { | 607 | { |
551 | struct ftrace_event_call *call = filp->private_data; | 608 | struct ftrace_event_call *call = m->private; |
552 | struct ftrace_event_field *field; | 609 | struct ftrace_event_field *field; |
553 | struct list_head *head; | 610 | struct list_head *head; |
554 | struct trace_seq *s; | ||
555 | int common_field_count = 5; | ||
556 | char *buf; | ||
557 | int r = 0; | ||
558 | 611 | ||
559 | if (*ppos) | 612 | (*pos)++; |
560 | return 0; | ||
561 | 613 | ||
562 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 614 | switch ((unsigned long)v) { |
563 | if (!s) | 615 | case FORMAT_HEADER: |
564 | return -ENOMEM; | 616 | head = &ftrace_common_fields; |
565 | 617 | ||
566 | trace_seq_init(s); | 618 | if (unlikely(list_empty(head))) |
619 | return NULL; | ||
567 | 620 | ||
568 | trace_seq_printf(s, "name: %s\n", call->name); | 621 | field = list_entry(head->prev, struct ftrace_event_field, link); |
569 | trace_seq_printf(s, "ID: %d\n", call->event.type); | 622 | return field; |
570 | trace_seq_printf(s, "format:\n"); | 623 | |
624 | case FORMAT_PRINTFMT: | ||
625 | /* all done */ | ||
626 | return NULL; | ||
627 | } | ||
571 | 628 | ||
572 | head = trace_get_fields(call); | 629 | head = trace_get_fields(call); |
573 | list_for_each_entry_reverse(field, head, link) { | ||
574 | /* | ||
575 | * Smartly shows the array type(except dynamic array). | ||
576 | * Normal: | ||
577 | * field:TYPE VAR | ||
578 | * If TYPE := TYPE[LEN], it is shown: | ||
579 | * field:TYPE VAR[LEN] | ||
580 | */ | ||
581 | const char *array_descriptor = strchr(field->type, '['); | ||
582 | 630 | ||
583 | if (!strncmp(field->type, "__data_loc", 10)) | 631 | /* |
584 | array_descriptor = NULL; | 632 | * To separate common fields from event fields, the |
633 | * LSB is set on the first event field. Clear it in case. | ||
634 | */ | ||
635 | v = (void *)((unsigned long)v & ~1L); | ||
585 | 636 | ||
586 | if (!array_descriptor) { | 637 | field = v; |
587 | r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" | 638 | /* |
588 | "\tsize:%u;\tsigned:%d;\n", | 639 | * If this is a common field, and at the end of the list, then |
589 | field->type, field->name, field->offset, | 640 | * continue with main list. |
590 | field->size, !!field->is_signed); | 641 | */ |
591 | } else { | 642 | if (field->link.prev == &ftrace_common_fields) { |
592 | r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" | 643 | if (unlikely(list_empty(head))) |
593 | "\tsize:%u;\tsigned:%d;\n", | 644 | return NULL; |
594 | (int)(array_descriptor - field->type), | 645 | field = list_entry(head->prev, struct ftrace_event_field, link); |
595 | field->type, field->name, | 646 | /* Set the LSB to notify f_show to print an extra newline */ |
596 | array_descriptor, field->offset, | 647 | field = (struct ftrace_event_field *) |
597 | field->size, !!field->is_signed); | 648 | ((unsigned long)field | 1); |
598 | } | 649 | return field; |
650 | } | ||
599 | 651 | ||
600 | if (--common_field_count == 0) | 652 | /* If we are done tell f_show to print the format */ |
601 | r = trace_seq_printf(s, "\n"); | 653 | if (field->link.prev == head) |
654 | return (void *)FORMAT_PRINTFMT; | ||
602 | 655 | ||
603 | if (!r) | 656 | field = list_entry(field->link.prev, struct ftrace_event_field, link); |
604 | break; | ||
605 | } | ||
606 | 657 | ||
607 | if (r) | 658 | return field; |
608 | r = trace_seq_printf(s, "\nprint fmt: %s\n", | 659 | } |
609 | call->print_fmt); | ||
610 | 660 | ||
611 | if (!r) { | 661 | static void *f_start(struct seq_file *m, loff_t *pos) |
612 | /* | 662 | { |
613 | * ug! The format output is bigger than a PAGE!! | 663 | loff_t l = 0; |
614 | */ | 664 | void *p; |
615 | buf = "FORMAT TOO BIG\n"; | 665 | |
616 | r = simple_read_from_buffer(ubuf, cnt, ppos, | 666 | /* Start by showing the header */ |
617 | buf, strlen(buf)); | 667 | if (!*pos) |
618 | goto out; | 668 | return (void *)FORMAT_HEADER; |
669 | |||
670 | p = (void *)FORMAT_HEADER; | ||
671 | do { | ||
672 | p = f_next(m, p, &l); | ||
673 | } while (p && l < *pos); | ||
674 | |||
675 | return p; | ||
676 | } | ||
677 | |||
678 | static int f_show(struct seq_file *m, void *v) | ||
679 | { | ||
680 | struct ftrace_event_call *call = m->private; | ||
681 | struct ftrace_event_field *field; | ||
682 | const char *array_descriptor; | ||
683 | |||
684 | switch ((unsigned long)v) { | ||
685 | case FORMAT_HEADER: | ||
686 | seq_printf(m, "name: %s\n", call->name); | ||
687 | seq_printf(m, "ID: %d\n", call->event.type); | ||
688 | seq_printf(m, "format:\n"); | ||
689 | return 0; | ||
690 | |||
691 | case FORMAT_PRINTFMT: | ||
692 | seq_printf(m, "\nprint fmt: %s\n", | ||
693 | call->print_fmt); | ||
694 | return 0; | ||
619 | } | 695 | } |
620 | 696 | ||
621 | r = simple_read_from_buffer(ubuf, cnt, ppos, | 697 | /* |
622 | s->buffer, s->len); | 698 | * To separate common fields from event fields, the |
623 | out: | 699 | * LSB is set on the first event field. Clear it and |
624 | kfree(s); | 700 | * print a newline if it is set. |
625 | return r; | 701 | */ |
702 | if ((unsigned long)v & 1) { | ||
703 | seq_putc(m, '\n'); | ||
704 | v = (void *)((unsigned long)v & ~1L); | ||
705 | } | ||
706 | |||
707 | field = v; | ||
708 | |||
709 | /* | ||
710 | * Smartly shows the array type(except dynamic array). | ||
711 | * Normal: | ||
712 | * field:TYPE VAR | ||
713 | * If TYPE := TYPE[LEN], it is shown: | ||
714 | * field:TYPE VAR[LEN] | ||
715 | */ | ||
716 | array_descriptor = strchr(field->type, '['); | ||
717 | |||
718 | if (!strncmp(field->type, "__data_loc", 10)) | ||
719 | array_descriptor = NULL; | ||
720 | |||
721 | if (!array_descriptor) | ||
722 | seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", | ||
723 | field->type, field->name, field->offset, | ||
724 | field->size, !!field->is_signed); | ||
725 | else | ||
726 | seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", | ||
727 | (int)(array_descriptor - field->type), | ||
728 | field->type, field->name, | ||
729 | array_descriptor, field->offset, | ||
730 | field->size, !!field->is_signed); | ||
731 | |||
732 | return 0; | ||
733 | } | ||
734 | |||
735 | static void f_stop(struct seq_file *m, void *p) | ||
736 | { | ||
737 | } | ||
738 | |||
739 | static const struct seq_operations trace_format_seq_ops = { | ||
740 | .start = f_start, | ||
741 | .next = f_next, | ||
742 | .stop = f_stop, | ||
743 | .show = f_show, | ||
744 | }; | ||
745 | |||
746 | static int trace_format_open(struct inode *inode, struct file *file) | ||
747 | { | ||
748 | struct ftrace_event_call *call = inode->i_private; | ||
749 | struct seq_file *m; | ||
750 | int ret; | ||
751 | |||
752 | ret = seq_open(file, &trace_format_seq_ops); | ||
753 | if (ret < 0) | ||
754 | return ret; | ||
755 | |||
756 | m = file->private_data; | ||
757 | m->private = call; | ||
758 | |||
759 | return 0; | ||
626 | } | 760 | } |
627 | 761 | ||
628 | static ssize_t | 762 | static ssize_t |
@@ -820,8 +954,10 @@ static const struct file_operations ftrace_enable_fops = { | |||
820 | }; | 954 | }; |
821 | 955 | ||
822 | static const struct file_operations ftrace_event_format_fops = { | 956 | static const struct file_operations ftrace_event_format_fops = { |
823 | .open = tracing_open_generic, | 957 | .open = trace_format_open, |
824 | .read = event_format_read, | 958 | .read = seq_read, |
959 | .llseek = seq_lseek, | ||
960 | .release = seq_release, | ||
825 | }; | 961 | }; |
826 | 962 | ||
827 | static const struct file_operations ftrace_event_id_fops = { | 963 | static const struct file_operations ftrace_event_id_fops = { |
@@ -963,35 +1099,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
963 | return -1; | 1099 | return -1; |
964 | } | 1100 | } |
965 | 1101 | ||
966 | if (call->class->probe || call->class->reg) | 1102 | if (call->class->reg) |
967 | trace_create_file("enable", 0644, call->dir, call, | 1103 | trace_create_file("enable", 0644, call->dir, call, |
968 | enable); | 1104 | enable); |
969 | 1105 | ||
970 | #ifdef CONFIG_PERF_EVENTS | 1106 | #ifdef CONFIG_PERF_EVENTS |
971 | if (call->event.type && (call->class->perf_probe || call->class->reg)) | 1107 | if (call->event.type && call->class->reg) |
972 | trace_create_file("id", 0444, call->dir, call, | 1108 | trace_create_file("id", 0444, call->dir, call, |
973 | id); | 1109 | id); |
974 | #endif | 1110 | #endif |
975 | 1111 | ||
976 | if (call->class->define_fields) { | 1112 | /* |
977 | /* | 1113 | * Other events may have the same class. Only update |
978 | * Other events may have the same class. Only update | 1114 | * the fields if they are not already defined. |
979 | * the fields if they are not already defined. | 1115 | */ |
980 | */ | 1116 | head = trace_get_fields(call); |
981 | head = trace_get_fields(call); | 1117 | if (list_empty(head)) { |
982 | if (list_empty(head)) { | 1118 | ret = call->class->define_fields(call); |
983 | ret = trace_define_common_fields(call); | 1119 | if (ret < 0) { |
984 | if (!ret) | 1120 | pr_warning("Could not initialize trace point" |
985 | ret = call->class->define_fields(call); | 1121 | " events/%s\n", call->name); |
986 | if (ret < 0) { | 1122 | return ret; |
987 | pr_warning("Could not initialize trace point" | ||
988 | " events/%s\n", call->name); | ||
989 | return ret; | ||
990 | } | ||
991 | } | 1123 | } |
992 | trace_create_file("filter", 0644, call->dir, call, | ||
993 | filter); | ||
994 | } | 1124 | } |
1125 | trace_create_file("filter", 0644, call->dir, call, | ||
1126 | filter); | ||
995 | 1127 | ||
996 | trace_create_file("format", 0444, call->dir, call, | 1128 | trace_create_file("format", 0444, call->dir, call, |
997 | format); | 1129 | format); |
@@ -999,11 +1131,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
999 | return 0; | 1131 | return 0; |
1000 | } | 1132 | } |
1001 | 1133 | ||
1002 | static int __trace_add_event_call(struct ftrace_event_call *call) | 1134 | static int |
1135 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | ||
1136 | const struct file_operations *id, | ||
1137 | const struct file_operations *enable, | ||
1138 | const struct file_operations *filter, | ||
1139 | const struct file_operations *format) | ||
1003 | { | 1140 | { |
1004 | struct dentry *d_events; | 1141 | struct dentry *d_events; |
1005 | int ret; | 1142 | int ret; |
1006 | 1143 | ||
1144 | /* The linker may leave blanks */ | ||
1007 | if (!call->name) | 1145 | if (!call->name) |
1008 | return -EINVAL; | 1146 | return -EINVAL; |
1009 | 1147 | ||
@@ -1011,8 +1149,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) | |||
1011 | ret = call->class->raw_init(call); | 1149 | ret = call->class->raw_init(call); |
1012 | if (ret < 0) { | 1150 | if (ret < 0) { |
1013 | if (ret != -ENOSYS) | 1151 | if (ret != -ENOSYS) |
1014 | pr_warning("Could not initialize trace " | 1152 | pr_warning("Could not initialize trace events/%s\n", |
1015 | "events/%s\n", call->name); | 1153 | call->name); |
1016 | return ret; | 1154 | return ret; |
1017 | } | 1155 | } |
1018 | } | 1156 | } |
@@ -1021,11 +1159,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) | |||
1021 | if (!d_events) | 1159 | if (!d_events) |
1022 | return -ENOENT; | 1160 | return -ENOENT; |
1023 | 1161 | ||
1024 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, | 1162 | ret = event_create_dir(call, d_events, id, enable, filter, format); |
1025 | &ftrace_enable_fops, &ftrace_event_filter_fops, | ||
1026 | &ftrace_event_format_fops); | ||
1027 | if (!ret) | 1163 | if (!ret) |
1028 | list_add(&call->list, &ftrace_events); | 1164 | list_add(&call->list, &ftrace_events); |
1165 | call->mod = mod; | ||
1029 | 1166 | ||
1030 | return ret; | 1167 | return ret; |
1031 | } | 1168 | } |
@@ -1035,7 +1172,10 @@ int trace_add_event_call(struct ftrace_event_call *call) | |||
1035 | { | 1172 | { |
1036 | int ret; | 1173 | int ret; |
1037 | mutex_lock(&event_mutex); | 1174 | mutex_lock(&event_mutex); |
1038 | ret = __trace_add_event_call(call); | 1175 | ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, |
1176 | &ftrace_enable_fops, | ||
1177 | &ftrace_event_filter_fops, | ||
1178 | &ftrace_event_format_fops); | ||
1039 | mutex_unlock(&event_mutex); | 1179 | mutex_unlock(&event_mutex); |
1040 | return ret; | 1180 | return ret; |
1041 | } | 1181 | } |
@@ -1152,8 +1292,6 @@ static void trace_module_add_events(struct module *mod) | |||
1152 | { | 1292 | { |
1153 | struct ftrace_module_file_ops *file_ops = NULL; | 1293 | struct ftrace_module_file_ops *file_ops = NULL; |
1154 | struct ftrace_event_call *call, *start, *end; | 1294 | struct ftrace_event_call *call, *start, *end; |
1155 | struct dentry *d_events; | ||
1156 | int ret; | ||
1157 | 1295 | ||
1158 | start = mod->trace_events; | 1296 | start = mod->trace_events; |
1159 | end = mod->trace_events + mod->num_trace_events; | 1297 | end = mod->trace_events + mod->num_trace_events; |
@@ -1161,38 +1299,14 @@ static void trace_module_add_events(struct module *mod) | |||
1161 | if (start == end) | 1299 | if (start == end) |
1162 | return; | 1300 | return; |
1163 | 1301 | ||
1164 | d_events = event_trace_events_dir(); | 1302 | file_ops = trace_create_file_ops(mod); |
1165 | if (!d_events) | 1303 | if (!file_ops) |
1166 | return; | 1304 | return; |
1167 | 1305 | ||
1168 | for_each_event(call, start, end) { | 1306 | for_each_event(call, start, end) { |
1169 | /* The linker may leave blanks */ | 1307 | __trace_add_event_call(call, mod, |
1170 | if (!call->name) | ||
1171 | continue; | ||
1172 | if (call->class->raw_init) { | ||
1173 | ret = call->class->raw_init(call); | ||
1174 | if (ret < 0) { | ||
1175 | if (ret != -ENOSYS) | ||
1176 | pr_warning("Could not initialize trace " | ||
1177 | "point events/%s\n", call->name); | ||
1178 | continue; | ||
1179 | } | ||
1180 | } | ||
1181 | /* | ||
1182 | * This module has events, create file ops for this module | ||
1183 | * if not already done. | ||
1184 | */ | ||
1185 | if (!file_ops) { | ||
1186 | file_ops = trace_create_file_ops(mod); | ||
1187 | if (!file_ops) | ||
1188 | return; | ||
1189 | } | ||
1190 | call->mod = mod; | ||
1191 | ret = event_create_dir(call, d_events, | ||
1192 | &file_ops->id, &file_ops->enable, | 1308 | &file_ops->id, &file_ops->enable, |
1193 | &file_ops->filter, &file_ops->format); | 1309 | &file_ops->filter, &file_ops->format); |
1194 | if (!ret) | ||
1195 | list_add(&call->list, &ftrace_events); | ||
1196 | } | 1310 | } |
1197 | } | 1311 | } |
1198 | 1312 | ||
@@ -1319,25 +1433,14 @@ static __init int event_trace_init(void) | |||
1319 | trace_create_file("enable", 0644, d_events, | 1433 | trace_create_file("enable", 0644, d_events, |
1320 | NULL, &ftrace_system_enable_fops); | 1434 | NULL, &ftrace_system_enable_fops); |
1321 | 1435 | ||
1436 | if (trace_define_common_fields()) | ||
1437 | pr_warning("tracing: Failed to allocate common fields"); | ||
1438 | |||
1322 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1439 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
1323 | /* The linker may leave blanks */ | 1440 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, |
1324 | if (!call->name) | ||
1325 | continue; | ||
1326 | if (call->class->raw_init) { | ||
1327 | ret = call->class->raw_init(call); | ||
1328 | if (ret < 0) { | ||
1329 | if (ret != -ENOSYS) | ||
1330 | pr_warning("Could not initialize trace " | ||
1331 | "point events/%s\n", call->name); | ||
1332 | continue; | ||
1333 | } | ||
1334 | } | ||
1335 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, | ||
1336 | &ftrace_enable_fops, | 1441 | &ftrace_enable_fops, |
1337 | &ftrace_event_filter_fops, | 1442 | &ftrace_event_filter_fops, |
1338 | &ftrace_event_format_fops); | 1443 | &ftrace_event_format_fops); |
1339 | if (!ret) | ||
1340 | list_add(&call->list, &ftrace_events); | ||
1341 | } | 1444 | } |
1342 | 1445 | ||
1343 | while (true) { | 1446 | while (true) { |
@@ -1524,12 +1627,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1524 | struct ftrace_entry *entry; | 1627 | struct ftrace_entry *entry; |
1525 | unsigned long flags; | 1628 | unsigned long flags; |
1526 | long disabled; | 1629 | long disabled; |
1527 | int resched; | ||
1528 | int cpu; | 1630 | int cpu; |
1529 | int pc; | 1631 | int pc; |
1530 | 1632 | ||
1531 | pc = preempt_count(); | 1633 | pc = preempt_count(); |
1532 | resched = ftrace_preempt_disable(); | 1634 | preempt_disable_notrace(); |
1533 | cpu = raw_smp_processor_id(); | 1635 | cpu = raw_smp_processor_id(); |
1534 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); | 1636 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); |
1535 | 1637 | ||
@@ -1551,7 +1653,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
1551 | 1653 | ||
1552 | out: | 1654 | out: |
1553 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1655 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
1554 | ftrace_preempt_enable(resched); | 1656 | preempt_enable_notrace(); |
1555 | } | 1657 | } |
1556 | 1658 | ||
1557 | static struct ftrace_ops trace_ops __initdata = | 1659 | static struct ftrace_ops trace_ops __initdata = |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb32999..36d40104b17f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
497 | } | 497 | } |
498 | 498 | ||
499 | static struct ftrace_event_field * | 499 | static struct ftrace_event_field * |
500 | find_event_field(struct ftrace_event_call *call, char *name) | 500 | __find_event_field(struct list_head *head, char *name) |
501 | { | 501 | { |
502 | struct ftrace_event_field *field; | 502 | struct ftrace_event_field *field; |
503 | struct list_head *head; | ||
504 | 503 | ||
505 | head = trace_get_fields(call); | ||
506 | list_for_each_entry(field, head, link) { | 504 | list_for_each_entry(field, head, link) { |
507 | if (!strcmp(field->name, name)) | 505 | if (!strcmp(field->name, name)) |
508 | return field; | 506 | return field; |
@@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) | |||
511 | return NULL; | 509 | return NULL; |
512 | } | 510 | } |
513 | 511 | ||
512 | static struct ftrace_event_field * | ||
513 | find_event_field(struct ftrace_event_call *call, char *name) | ||
514 | { | ||
515 | struct ftrace_event_field *field; | ||
516 | struct list_head *head; | ||
517 | |||
518 | field = __find_event_field(&ftrace_common_fields, name); | ||
519 | if (field) | ||
520 | return field; | ||
521 | |||
522 | head = trace_get_fields(call); | ||
523 | return __find_event_field(head, name); | ||
524 | } | ||
525 | |||
514 | static void filter_free_pred(struct filter_pred *pred) | 526 | static void filter_free_pred(struct filter_pred *pred) |
515 | { | 527 | { |
516 | if (!pred) | 528 | if (!pred) |
@@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) | |||
627 | int err; | 639 | int err; |
628 | 640 | ||
629 | list_for_each_entry(call, &ftrace_events, list) { | 641 | list_for_each_entry(call, &ftrace_events, list) { |
630 | if (!call->class || !call->class->define_fields) | ||
631 | continue; | ||
632 | |||
633 | if (strcmp(call->class->system, system->name) != 0) | 642 | if (strcmp(call->class->system, system->name) != 0) |
634 | continue; | 643 | continue; |
635 | 644 | ||
@@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) | |||
646 | struct ftrace_event_call *call; | 655 | struct ftrace_event_call *call; |
647 | 656 | ||
648 | list_for_each_entry(call, &ftrace_events, list) { | 657 | list_for_each_entry(call, &ftrace_events, list) { |
649 | if (!call->class || !call->class->define_fields) | ||
650 | continue; | ||
651 | |||
652 | if (strcmp(call->class->system, system->name) != 0) | 658 | if (strcmp(call->class->system, system->name) != 0) |
653 | continue; | 659 | continue; |
654 | 660 | ||
@@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1251 | list_for_each_entry(call, &ftrace_events, list) { | 1257 | list_for_each_entry(call, &ftrace_events, list) { |
1252 | struct event_filter *filter = call->filter; | 1258 | struct event_filter *filter = call->filter; |
1253 | 1259 | ||
1254 | if (!call->class || !call->class->define_fields) | ||
1255 | continue; | ||
1256 | |||
1257 | if (strcmp(call->class->system, system->name) != 0) | 1260 | if (strcmp(call->class->system, system->name) != 0) |
1258 | continue; | 1261 | continue; |
1259 | 1262 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a65969..4ba44deaac25 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
125 | 125 | ||
126 | #include "trace_entries.h" | 126 | #include "trace_entries.h" |
127 | 127 | ||
128 | static int ftrace_raw_init_event(struct ftrace_event_call *call) | ||
129 | { | ||
130 | INIT_LIST_HEAD(&call->class->fields); | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | #undef __entry | 128 | #undef __entry |
135 | #define __entry REC | 129 | #define __entry REC |
136 | 130 | ||
@@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
158 | struct ftrace_event_class event_class_ftrace_##call = { \ | 152 | struct ftrace_event_class event_class_ftrace_##call = { \ |
159 | .system = __stringify(TRACE_SYSTEM), \ | 153 | .system = __stringify(TRACE_SYSTEM), \ |
160 | .define_fields = ftrace_define_fields_##call, \ | 154 | .define_fields = ftrace_define_fields_##call, \ |
161 | .raw_init = ftrace_raw_init_event, \ | 155 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
162 | }; \ | 156 | }; \ |
163 | \ | 157 | \ |
164 | struct ftrace_event_call __used \ | 158 | struct ftrace_event_call __used \ |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd6..16aee4d44e8f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
54 | struct trace_array_cpu *data; | 54 | struct trace_array_cpu *data; |
55 | unsigned long flags; | 55 | unsigned long flags; |
56 | long disabled; | 56 | long disabled; |
57 | int cpu, resched; | 57 | int cpu; |
58 | int pc; | 58 | int pc; |
59 | 59 | ||
60 | if (unlikely(!ftrace_function_enabled)) | 60 | if (unlikely(!ftrace_function_enabled)) |
61 | return; | 61 | return; |
62 | 62 | ||
63 | pc = preempt_count(); | 63 | pc = preempt_count(); |
64 | resched = ftrace_preempt_disable(); | 64 | preempt_disable_notrace(); |
65 | local_save_flags(flags); | 65 | local_save_flags(flags); |
66 | cpu = raw_smp_processor_id(); | 66 | cpu = raw_smp_processor_id(); |
67 | data = tr->data[cpu]; | 67 | data = tr->data[cpu]; |
@@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
71 | trace_function(tr, ip, parent_ip, flags, pc); | 71 | trace_function(tr, ip, parent_ip, flags, pc); |
72 | 72 | ||
73 | atomic_dec(&data->disabled); | 73 | atomic_dec(&data->disabled); |
74 | ftrace_preempt_enable(resched); | 74 | preempt_enable_notrace(); |
75 | } | 75 | } |
76 | 76 | ||
77 | static void | 77 | static void |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a94..6f233698518e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
507 | * if the output fails. | 507 | * if the output fails. |
508 | */ | 508 | */ |
509 | data->ent = *curr; | 509 | data->ent = *curr; |
510 | data->ret = *next; | 510 | /* |
511 | * If the next event is not a return type, then | ||
512 | * we only care about what type it is. Otherwise we can | ||
513 | * safely copy the entire event. | ||
514 | */ | ||
515 | if (next->ent.type == TRACE_GRAPH_RET) | ||
516 | data->ret = *next; | ||
517 | else | ||
518 | data->ret.ent.type = next->ent.type; | ||
511 | } | 519 | } |
512 | } | 520 | } |
513 | 521 | ||
@@ -641,7 +649,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
641 | 649 | ||
642 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 650 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
643 | if (len < 7) { | 651 | if (len < 7) { |
644 | snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); | 652 | snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", |
653 | nsecs_rem); | ||
645 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 654 | ret = trace_seq_printf(s, ".%s", nsecs_str); |
646 | if (!ret) | 655 | if (!ret) |
647 | return TRACE_TYPE_PARTIAL_LINE; | 656 | return TRACE_TYPE_PARTIAL_LINE; |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e0cef4..73a6b0601f2e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
649 | #endif | 649 | #endif |
650 | .open = irqsoff_trace_open, | 650 | .open = irqsoff_trace_open, |
651 | .close = irqsoff_trace_close, | 651 | .close = irqsoff_trace_close, |
652 | .use_max_tr = 1, | ||
652 | }; | 653 | }; |
653 | # define register_irqsoff(trace) register_tracer(&trace) | 654 | # define register_irqsoff(trace) register_tracer(&trace) |
654 | #else | 655 | #else |
@@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
681 | #endif | 682 | #endif |
682 | .open = irqsoff_trace_open, | 683 | .open = irqsoff_trace_open, |
683 | .close = irqsoff_trace_close, | 684 | .close = irqsoff_trace_close, |
685 | .use_max_tr = 1, | ||
684 | }; | 686 | }; |
685 | # define register_preemptoff(trace) register_tracer(&trace) | 687 | # define register_preemptoff(trace) register_tracer(&trace) |
686 | #else | 688 | #else |
@@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
715 | #endif | 717 | #endif |
716 | .open = irqsoff_trace_open, | 718 | .open = irqsoff_trace_open, |
717 | .close = irqsoff_trace_close, | 719 | .close = irqsoff_trace_close, |
720 | .use_max_tr = 1, | ||
718 | }; | 721 | }; |
719 | 722 | ||
720 | # define register_preemptirqsoff(trace) register_tracer(&trace) | 723 | # define register_preemptirqsoff(trace) register_tracer(&trace) |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 000000000000..7b8ecd751d93 --- /dev/null +++ b/kernel/trace/trace_kdb.c | |||
@@ -0,0 +1,136 @@ | |||
1 | /* | ||
2 | * kdb helper for dumping the ftrace buffer | ||
3 | * | ||
4 | * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> | ||
5 | * | ||
6 | * ftrace_dump_buf based on ftrace_dump: | ||
7 | * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> | ||
8 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> | ||
9 | * | ||
10 | */ | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kgdb.h> | ||
13 | #include <linux/kdb.h> | ||
14 | #include <linux/ftrace.h> | ||
15 | |||
16 | #include "../debug/kdb/kdb_private.h" | ||
17 | #include "trace.h" | ||
18 | #include "trace_output.h" | ||
19 | |||
20 | static void ftrace_dump_buf(int skip_lines, long cpu_file) | ||
21 | { | ||
22 | /* use static because iter can be a bit big for the stack */ | ||
23 | static struct trace_iterator iter; | ||
24 | unsigned int old_userobj; | ||
25 | int cnt = 0, cpu; | ||
26 | |||
27 | trace_init_global_iter(&iter); | ||
28 | |||
29 | for_each_tracing_cpu(cpu) { | ||
30 | atomic_inc(&iter.tr->data[cpu]->disabled); | ||
31 | } | ||
32 | |||
33 | old_userobj = trace_flags; | ||
34 | |||
35 | /* don't look at user memory in panic mode */ | ||
36 | trace_flags &= ~TRACE_ITER_SYM_USEROBJ; | ||
37 | |||
38 | kdb_printf("Dumping ftrace buffer:\n"); | ||
39 | |||
40 | /* reset all but tr, trace, and overruns */ | ||
41 | memset(&iter.seq, 0, | ||
42 | sizeof(struct trace_iterator) - | ||
43 | offsetof(struct trace_iterator, seq)); | ||
44 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | ||
45 | iter.pos = -1; | ||
46 | |||
47 | if (cpu_file == TRACE_PIPE_ALL_CPU) { | ||
48 | for_each_tracing_cpu(cpu) { | ||
49 | iter.buffer_iter[cpu] = | ||
50 | ring_buffer_read_prepare(iter.tr->buffer, cpu); | ||
51 | ring_buffer_read_start(iter.buffer_iter[cpu]); | ||
52 | tracing_iter_reset(&iter, cpu); | ||
53 | } | ||
54 | } else { | ||
55 | iter.cpu_file = cpu_file; | ||
56 | iter.buffer_iter[cpu_file] = | ||
57 | ring_buffer_read_prepare(iter.tr->buffer, cpu_file); | ||
58 | ring_buffer_read_start(iter.buffer_iter[cpu_file]); | ||
59 | tracing_iter_reset(&iter, cpu_file); | ||
60 | } | ||
61 | if (!trace_empty(&iter)) | ||
62 | trace_find_next_entry_inc(&iter); | ||
63 | while (!trace_empty(&iter)) { | ||
64 | if (!cnt) | ||
65 | kdb_printf("---------------------------------\n"); | ||
66 | cnt++; | ||
67 | |||
68 | if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) | ||
69 | print_trace_line(&iter); | ||
70 | if (!skip_lines) | ||
71 | trace_printk_seq(&iter.seq); | ||
72 | else | ||
73 | skip_lines--; | ||
74 | if (KDB_FLAG(CMD_INTERRUPT)) | ||
75 | goto out; | ||
76 | } | ||
77 | |||
78 | if (!cnt) | ||
79 | kdb_printf(" (ftrace buffer empty)\n"); | ||
80 | else | ||
81 | kdb_printf("---------------------------------\n"); | ||
82 | |||
83 | out: | ||
84 | trace_flags = old_userobj; | ||
85 | |||
86 | for_each_tracing_cpu(cpu) { | ||
87 | atomic_dec(&iter.tr->data[cpu]->disabled); | ||
88 | } | ||
89 | |||
90 | for_each_tracing_cpu(cpu) | ||
91 | if (iter.buffer_iter[cpu]) | ||
92 | ring_buffer_read_finish(iter.buffer_iter[cpu]); | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * kdb_ftdump - Dump the ftrace log buffer | ||
97 | */ | ||
98 | static int kdb_ftdump(int argc, const char **argv) | ||
99 | { | ||
100 | int skip_lines = 0; | ||
101 | long cpu_file; | ||
102 | char *cp; | ||
103 | |||
104 | if (argc > 2) | ||
105 | return KDB_ARGCOUNT; | ||
106 | |||
107 | if (argc) { | ||
108 | skip_lines = simple_strtol(argv[1], &cp, 0); | ||
109 | if (*cp) | ||
110 | skip_lines = 0; | ||
111 | } | ||
112 | |||
113 | if (argc == 2) { | ||
114 | cpu_file = simple_strtol(argv[2], &cp, 0); | ||
115 | if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || | ||
116 | !cpu_online(cpu_file)) | ||
117 | return KDB_BADINT; | ||
118 | } else { | ||
119 | cpu_file = TRACE_PIPE_ALL_CPU; | ||
120 | } | ||
121 | |||
122 | kdb_trap_printk++; | ||
123 | ftrace_dump_buf(skip_lines, cpu_file); | ||
124 | kdb_trap_printk--; | ||
125 | |||
126 | return 0; | ||
127 | } | ||
128 | |||
129 | static __init int kdb_ftrace_register(void) | ||
130 | { | ||
131 | kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", | ||
132 | "Dump ftrace log", 0, KDB_REPEAT_NONE); | ||
133 | return 0; | ||
134 | } | ||
135 | |||
136 | late_initcall(kdb_ftrace_register); | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299d..544301d29dee 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -30,6 +30,8 @@ | |||
30 | #include <linux/ptrace.h> | 30 | #include <linux/ptrace.h> |
31 | #include <linux/perf_event.h> | 31 | #include <linux/perf_event.h> |
32 | #include <linux/stringify.h> | 32 | #include <linux/stringify.h> |
33 | #include <linux/limits.h> | ||
34 | #include <linux/uaccess.h> | ||
33 | #include <asm/bitsperlong.h> | 35 | #include <asm/bitsperlong.h> |
34 | 36 | ||
35 | #include "trace.h" | 37 | #include "trace.h" |
@@ -38,6 +40,7 @@ | |||
38 | #define MAX_TRACE_ARGS 128 | 40 | #define MAX_TRACE_ARGS 128 |
39 | #define MAX_ARGSTR_LEN 63 | 41 | #define MAX_ARGSTR_LEN 63 |
40 | #define MAX_EVENT_NAME_LEN 64 | 42 | #define MAX_EVENT_NAME_LEN 64 |
43 | #define MAX_STRING_SIZE PATH_MAX | ||
41 | #define KPROBE_EVENT_SYSTEM "kprobes" | 44 | #define KPROBE_EVENT_SYSTEM "kprobes" |
42 | 45 | ||
43 | /* Reserved field names */ | 46 | /* Reserved field names */ |
@@ -58,14 +61,16 @@ const char *reserved_field_names[] = { | |||
58 | }; | 61 | }; |
59 | 62 | ||
60 | /* Printing function type */ | 63 | /* Printing function type */ |
61 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); | 64 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, |
65 | void *); | ||
62 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | 66 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type |
63 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | 67 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type |
64 | 68 | ||
65 | /* Printing in basic type function template */ | 69 | /* Printing in basic type function template */ |
66 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | 70 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ |
67 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | 71 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ |
68 | const char *name, void *data)\ | 72 | const char *name, \ |
73 | void *data, void *ent)\ | ||
69 | { \ | 74 | { \ |
70 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | 75 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ |
71 | } \ | 76 | } \ |
@@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | |||
80 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | 85 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) |
81 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | 86 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) |
82 | 87 | ||
88 | /* data_rloc: data relative location, compatible with u32 */ | ||
89 | #define make_data_rloc(len, roffs) \ | ||
90 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
91 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
92 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
93 | |||
94 | static inline void *get_rloc_data(u32 *dl) | ||
95 | { | ||
96 | return (u8 *)dl + get_rloc_offs(*dl); | ||
97 | } | ||
98 | |||
99 | /* For data_loc conversion */ | ||
100 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
101 | { | ||
102 | return (u8 *)ent + get_rloc_offs(*dl); | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * Convert data_rloc to data_loc: | ||
107 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
108 | * stores the offset from event entry. | ||
109 | */ | ||
110 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
111 | |||
112 | /* For defining macros, define string/string_size types */ | ||
113 | typedef u32 string; | ||
114 | typedef u32 string_size; | ||
115 | |||
116 | /* Print type function for string type */ | ||
117 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
118 | const char *name, | ||
119 | void *data, void *ent) | ||
120 | { | ||
121 | int len = *(u32 *)data >> 16; | ||
122 | |||
123 | if (!len) | ||
124 | return trace_seq_printf(s, " %s=(fault)", name); | ||
125 | else | ||
126 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
127 | (const char *)get_loc_data(data, ent)); | ||
128 | } | ||
129 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
130 | |||
83 | /* Data fetch function type */ | 131 | /* Data fetch function type */ |
84 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | 132 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); |
85 | 133 | ||
@@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, | |||
94 | return fprm->fn(regs, fprm->data, dest); | 142 | return fprm->fn(regs, fprm->data, dest); |
95 | } | 143 | } |
96 | 144 | ||
97 | #define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type | 145 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type |
98 | /* | 146 | /* |
99 | * Define macro for basic types - we don't need to define s* types, because | 147 | * Define macro for basic types - we don't need to define s* types, because |
100 | * we have to care only about bitwidth at recording time. | 148 | * we have to care only about bitwidth at recording time. |
101 | */ | 149 | */ |
102 | #define DEFINE_BASIC_FETCH_FUNCS(kind) \ | 150 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ |
103 | DEFINE_FETCH_##kind(u8) \ | 151 | DEFINE_FETCH_##method(u8) \ |
104 | DEFINE_FETCH_##kind(u16) \ | 152 | DEFINE_FETCH_##method(u16) \ |
105 | DEFINE_FETCH_##kind(u32) \ | 153 | DEFINE_FETCH_##method(u32) \ |
106 | DEFINE_FETCH_##kind(u64) | 154 | DEFINE_FETCH_##method(u64) |
107 | 155 | ||
108 | #define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ | 156 | #define CHECK_FETCH_FUNCS(method, fn) \ |
109 | ((FETCH_FUNC_NAME(kind, u8) == fn) || \ | 157 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ |
110 | (FETCH_FUNC_NAME(kind, u16) == fn) || \ | 158 | (FETCH_FUNC_NAME(method, u16) == fn) || \ |
111 | (FETCH_FUNC_NAME(kind, u32) == fn) || \ | 159 | (FETCH_FUNC_NAME(method, u32) == fn) || \ |
112 | (FETCH_FUNC_NAME(kind, u64) == fn)) | 160 | (FETCH_FUNC_NAME(method, u64) == fn) || \ |
161 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
162 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
163 | && (fn != NULL)) | ||
113 | 164 | ||
114 | /* Data fetch function templates */ | 165 | /* Data fetch function templates */ |
115 | #define DEFINE_FETCH_reg(type) \ | 166 | #define DEFINE_FETCH_reg(type) \ |
116 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | 167 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ |
117 | void *offset, void *dest) \ | 168 | void *offset, void *dest) \ |
118 | { \ | 169 | { \ |
119 | *(type *)dest = (type)regs_get_register(regs, \ | 170 | *(type *)dest = (type)regs_get_register(regs, \ |
120 | (unsigned int)((unsigned long)offset)); \ | 171 | (unsigned int)((unsigned long)offset)); \ |
121 | } | 172 | } |
122 | DEFINE_BASIC_FETCH_FUNCS(reg) | 173 | DEFINE_BASIC_FETCH_FUNCS(reg) |
174 | /* No string on the register */ | ||
175 | #define fetch_reg_string NULL | ||
176 | #define fetch_reg_string_size NULL | ||
123 | 177 | ||
124 | #define DEFINE_FETCH_stack(type) \ | 178 | #define DEFINE_FETCH_stack(type) \ |
125 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | 179 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ |
@@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | |||
129 | (unsigned int)((unsigned long)offset)); \ | 183 | (unsigned int)((unsigned long)offset)); \ |
130 | } | 184 | } |
131 | DEFINE_BASIC_FETCH_FUNCS(stack) | 185 | DEFINE_BASIC_FETCH_FUNCS(stack) |
186 | /* No string on the stack entry */ | ||
187 | #define fetch_stack_string NULL | ||
188 | #define fetch_stack_string_size NULL | ||
132 | 189 | ||
133 | #define DEFINE_FETCH_retval(type) \ | 190 | #define DEFINE_FETCH_retval(type) \ |
134 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | 191 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ |
@@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | |||
137 | *(type *)dest = (type)regs_return_value(regs); \ | 194 | *(type *)dest = (type)regs_return_value(regs); \ |
138 | } | 195 | } |
139 | DEFINE_BASIC_FETCH_FUNCS(retval) | 196 | DEFINE_BASIC_FETCH_FUNCS(retval) |
197 | /* No string on the retval */ | ||
198 | #define fetch_retval_string NULL | ||
199 | #define fetch_retval_string_size NULL | ||
140 | 200 | ||
141 | #define DEFINE_FETCH_memory(type) \ | 201 | #define DEFINE_FETCH_memory(type) \ |
142 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | 202 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ |
@@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | |||
149 | *(type *)dest = retval; \ | 209 | *(type *)dest = retval; \ |
150 | } | 210 | } |
151 | DEFINE_BASIC_FETCH_FUNCS(memory) | 211 | DEFINE_BASIC_FETCH_FUNCS(memory) |
212 | /* | ||
213 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
214 | * length and relative data location. | ||
215 | */ | ||
216 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
217 | void *addr, void *dest) | ||
218 | { | ||
219 | long ret; | ||
220 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
221 | u8 *dst = get_rloc_data(dest); | ||
222 | u8 *src = addr; | ||
223 | mm_segment_t old_fs = get_fs(); | ||
224 | if (!maxlen) | ||
225 | return; | ||
226 | /* | ||
227 | * Try to get string again, since the string can be changed while | ||
228 | * probing. | ||
229 | */ | ||
230 | set_fs(KERNEL_DS); | ||
231 | pagefault_disable(); | ||
232 | do | ||
233 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
234 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
235 | dst[-1] = '\0'; | ||
236 | pagefault_enable(); | ||
237 | set_fs(old_fs); | ||
238 | |||
239 | if (ret < 0) { /* Failed to fetch string */ | ||
240 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
241 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
242 | } else | ||
243 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
244 | get_rloc_offs(*(u32 *)dest)); | ||
245 | } | ||
246 | /* Return the length of string -- including null terminal byte */ | ||
247 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
248 | void *addr, void *dest) | ||
249 | { | ||
250 | int ret, len = 0; | ||
251 | u8 c; | ||
252 | mm_segment_t old_fs = get_fs(); | ||
253 | |||
254 | set_fs(KERNEL_DS); | ||
255 | pagefault_disable(); | ||
256 | do { | ||
257 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
258 | len++; | ||
259 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
260 | pagefault_enable(); | ||
261 | set_fs(old_fs); | ||
262 | |||
263 | if (ret < 0) /* Failed to check the length */ | ||
264 | *(u32 *)dest = 0; | ||
265 | else | ||
266 | *(u32 *)dest = len; | ||
267 | } | ||
152 | 268 | ||
153 | /* Memory fetching by symbol */ | 269 | /* Memory fetching by symbol */ |
154 | struct symbol_cache { | 270 | struct symbol_cache { |
@@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | |||
203 | *(type *)dest = 0; \ | 319 | *(type *)dest = 0; \ |
204 | } | 320 | } |
205 | DEFINE_BASIC_FETCH_FUNCS(symbol) | 321 | DEFINE_BASIC_FETCH_FUNCS(symbol) |
322 | DEFINE_FETCH_symbol(string) | ||
323 | DEFINE_FETCH_symbol(string_size) | ||
206 | 324 | ||
207 | /* Dereference memory access function */ | 325 | /* Dereference memory access function */ |
208 | struct deref_fetch_param { | 326 | struct deref_fetch_param { |
@@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | |||
224 | *(type *)dest = 0; \ | 342 | *(type *)dest = 0; \ |
225 | } | 343 | } |
226 | DEFINE_BASIC_FETCH_FUNCS(deref) | 344 | DEFINE_BASIC_FETCH_FUNCS(deref) |
345 | DEFINE_FETCH_deref(string) | ||
346 | DEFINE_FETCH_deref(string_size) | ||
227 | 347 | ||
228 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | 348 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) |
229 | { | 349 | { |
230 | if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) | 350 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
231 | free_deref_fetch_param(data->orig.data); | 351 | free_deref_fetch_param(data->orig.data); |
232 | else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) | 352 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) |
233 | free_symbol_cache(data->orig.data); | 353 | free_symbol_cache(data->orig.data); |
234 | kfree(data); | 354 | kfree(data); |
235 | } | 355 | } |
@@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
240 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | 360 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) |
241 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | 361 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) |
242 | 362 | ||
243 | #define ASSIGN_FETCH_FUNC(kind, type) \ | 363 | /* Fetch types */ |
244 | .kind = FETCH_FUNC_NAME(kind, type) | 364 | enum { |
245 | 365 | FETCH_MTD_reg = 0, | |
246 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | 366 | FETCH_MTD_stack, |
247 | {.name = #ptype, \ | 367 | FETCH_MTD_retval, |
248 | .size = sizeof(ftype), \ | 368 | FETCH_MTD_memory, |
249 | .is_signed = sign, \ | 369 | FETCH_MTD_symbol, |
250 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | 370 | FETCH_MTD_deref, |
251 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | 371 | FETCH_MTD_END, |
252 | ASSIGN_FETCH_FUNC(reg, ftype), \ | 372 | }; |
253 | ASSIGN_FETCH_FUNC(stack, ftype), \ | 373 | |
254 | ASSIGN_FETCH_FUNC(retval, ftype), \ | 374 | #define ASSIGN_FETCH_FUNC(method, type) \ |
255 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 375 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) |
256 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 376 | |
257 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 377 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ |
378 | {.name = _name, \ | ||
379 | .size = _size, \ | ||
380 | .is_signed = sign, \ | ||
381 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
382 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
383 | .fmttype = _fmttype, \ | ||
384 | .fetch = { \ | ||
385 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
386 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
387 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
388 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
389 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
390 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
391 | } \ | ||
258 | } | 392 | } |
259 | 393 | ||
394 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
395 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
396 | |||
397 | #define FETCH_TYPE_STRING 0 | ||
398 | #define FETCH_TYPE_STRSIZE 1 | ||
399 | |||
260 | /* Fetch type information table */ | 400 | /* Fetch type information table */ |
261 | static const struct fetch_type { | 401 | static const struct fetch_type { |
262 | const char *name; /* Name of type */ | 402 | const char *name; /* Name of type */ |
@@ -264,14 +404,16 @@ static const struct fetch_type { | |||
264 | int is_signed; /* Signed flag */ | 404 | int is_signed; /* Signed flag */ |
265 | print_type_func_t print; /* Print functions */ | 405 | print_type_func_t print; /* Print functions */ |
266 | const char *fmt; /* Format string */ | 406 | const char *fmt; /* Format string */ |
407 | const char *fmttype; /* Name in format file */ | ||
267 | /* Fetch functions */ | 408 | /* Fetch functions */ |
268 | fetch_func_t reg; | 409 | fetch_func_t fetch[FETCH_MTD_END]; |
269 | fetch_func_t stack; | ||
270 | fetch_func_t retval; | ||
271 | fetch_func_t memory; | ||
272 | fetch_func_t symbol; | ||
273 | fetch_func_t deref; | ||
274 | } fetch_type_table[] = { | 410 | } fetch_type_table[] = { |
411 | /* Special types */ | ||
412 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
413 | sizeof(u32), 1, "__data_loc char[]"), | ||
414 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
415 | string_size, sizeof(u32), 0, "u32"), | ||
416 | /* Basic types */ | ||
275 | ASSIGN_FETCH_TYPE(u8, u8, 0), | 417 | ASSIGN_FETCH_TYPE(u8, u8, 0), |
276 | ASSIGN_FETCH_TYPE(u16, u16, 0), | 418 | ASSIGN_FETCH_TYPE(u16, u16, 0), |
277 | ASSIGN_FETCH_TYPE(u32, u32, 0), | 419 | ASSIGN_FETCH_TYPE(u32, u32, 0), |
@@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, | |||
302 | *(unsigned long *)dest = kernel_stack_pointer(regs); | 444 | *(unsigned long *)dest = kernel_stack_pointer(regs); |
303 | } | 445 | } |
304 | 446 | ||
447 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
448 | fetch_func_t orig_fn) | ||
449 | { | ||
450 | int i; | ||
451 | |||
452 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
453 | return NULL; /* Only string type needs size function */ | ||
454 | for (i = 0; i < FETCH_MTD_END; i++) | ||
455 | if (type->fetch[i] == orig_fn) | ||
456 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
457 | |||
458 | WARN_ON(1); /* This should not happen */ | ||
459 | return NULL; | ||
460 | } | ||
461 | |||
305 | /** | 462 | /** |
306 | * Kprobe event core functions | 463 | * Kprobe event core functions |
307 | */ | 464 | */ |
308 | 465 | ||
309 | struct probe_arg { | 466 | struct probe_arg { |
310 | struct fetch_param fetch; | 467 | struct fetch_param fetch; |
468 | struct fetch_param fetch_size; | ||
311 | unsigned int offset; /* Offset from argument entry */ | 469 | unsigned int offset; /* Offset from argument entry */ |
312 | const char *name; /* Name of this argument */ | 470 | const char *name; /* Name of this argument */ |
313 | const char *comm; /* Command of this argument */ | 471 | const char *comm; /* Command of this argument */ |
@@ -356,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | |||
356 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | 514 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, |
357 | struct pt_regs *regs); | 515 | struct pt_regs *regs); |
358 | 516 | ||
359 | /* Check the name is good for event/group */ | 517 | /* Check the name is good for event/group/fields */ |
360 | static int check_event_name(const char *name) | 518 | static int is_good_name(const char *name) |
361 | { | 519 | { |
362 | if (!isalpha(*name) && *name != '_') | 520 | if (!isalpha(*name) && *name != '_') |
363 | return 0; | 521 | return 0; |
@@ -399,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
399 | else | 557 | else |
400 | tp->rp.kp.pre_handler = kprobe_dispatcher; | 558 | tp->rp.kp.pre_handler = kprobe_dispatcher; |
401 | 559 | ||
402 | if (!event || !check_event_name(event)) { | 560 | if (!event || !is_good_name(event)) { |
403 | ret = -EINVAL; | 561 | ret = -EINVAL; |
404 | goto error; | 562 | goto error; |
405 | } | 563 | } |
@@ -409,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, | |||
409 | if (!tp->call.name) | 567 | if (!tp->call.name) |
410 | goto error; | 568 | goto error; |
411 | 569 | ||
412 | if (!group || !check_event_name(group)) { | 570 | if (!group || !is_good_name(group)) { |
413 | ret = -EINVAL; | 571 | ret = -EINVAL; |
414 | goto error; | 572 | goto error; |
415 | } | 573 | } |
@@ -429,9 +587,9 @@ error: | |||
429 | 587 | ||
430 | static void free_probe_arg(struct probe_arg *arg) | 588 | static void free_probe_arg(struct probe_arg *arg) |
431 | { | 589 | { |
432 | if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) | 590 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) |
433 | free_deref_fetch_param(arg->fetch.data); | 591 | free_deref_fetch_param(arg->fetch.data); |
434 | else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) | 592 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
435 | free_symbol_cache(arg->fetch.data); | 593 | free_symbol_cache(arg->fetch.data); |
436 | kfree(arg->name); | 594 | kfree(arg->name); |
437 | kfree(arg->comm); | 595 | kfree(arg->comm); |
@@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
548 | 706 | ||
549 | if (strcmp(arg, "retval") == 0) { | 707 | if (strcmp(arg, "retval") == 0) { |
550 | if (is_return) | 708 | if (is_return) |
551 | f->fn = t->retval; | 709 | f->fn = t->fetch[FETCH_MTD_retval]; |
552 | else | 710 | else |
553 | ret = -EINVAL; | 711 | ret = -EINVAL; |
554 | } else if (strncmp(arg, "stack", 5) == 0) { | 712 | } else if (strncmp(arg, "stack", 5) == 0) { |
@@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
562 | if (ret || param > PARAM_MAX_STACK) | 720 | if (ret || param > PARAM_MAX_STACK) |
563 | ret = -EINVAL; | 721 | ret = -EINVAL; |
564 | else { | 722 | else { |
565 | f->fn = t->stack; | 723 | f->fn = t->fetch[FETCH_MTD_stack]; |
566 | f->data = (void *)param; | 724 | f->data = (void *)param; |
567 | } | 725 | } |
568 | } else | 726 | } else |
@@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
588 | case '%': /* named register */ | 746 | case '%': /* named register */ |
589 | ret = regs_query_register_offset(arg + 1); | 747 | ret = regs_query_register_offset(arg + 1); |
590 | if (ret >= 0) { | 748 | if (ret >= 0) { |
591 | f->fn = t->reg; | 749 | f->fn = t->fetch[FETCH_MTD_reg]; |
592 | f->data = (void *)(unsigned long)ret; | 750 | f->data = (void *)(unsigned long)ret; |
593 | ret = 0; | 751 | ret = 0; |
594 | } | 752 | } |
@@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
598 | ret = strict_strtoul(arg + 1, 0, ¶m); | 756 | ret = strict_strtoul(arg + 1, 0, ¶m); |
599 | if (ret) | 757 | if (ret) |
600 | break; | 758 | break; |
601 | f->fn = t->memory; | 759 | f->fn = t->fetch[FETCH_MTD_memory]; |
602 | f->data = (void *)param; | 760 | f->data = (void *)param; |
603 | } else { | 761 | } else { |
604 | ret = split_symbol_offset(arg + 1, &offset); | 762 | ret = split_symbol_offset(arg + 1, &offset); |
@@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
606 | break; | 764 | break; |
607 | f->data = alloc_symbol_cache(arg + 1, offset); | 765 | f->data = alloc_symbol_cache(arg + 1, offset); |
608 | if (f->data) | 766 | if (f->data) |
609 | f->fn = t->symbol; | 767 | f->fn = t->fetch[FETCH_MTD_symbol]; |
610 | } | 768 | } |
611 | break; | 769 | break; |
612 | case '+': /* deref memory */ | 770 | case '+': /* deref memory */ |
@@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
636 | if (ret) | 794 | if (ret) |
637 | kfree(dprm); | 795 | kfree(dprm); |
638 | else { | 796 | else { |
639 | f->fn = t->deref; | 797 | f->fn = t->fetch[FETCH_MTD_deref]; |
640 | f->data = (void *)dprm; | 798 | f->data = (void *)dprm; |
641 | } | 799 | } |
642 | } | 800 | } |
643 | break; | 801 | break; |
644 | } | 802 | } |
645 | if (!ret && !f->fn) | 803 | if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ |
804 | pr_info("%s type has no corresponding fetch method.\n", | ||
805 | t->name); | ||
646 | ret = -EINVAL; | 806 | ret = -EINVAL; |
807 | } | ||
647 | return ret; | 808 | return ret; |
648 | } | 809 | } |
649 | 810 | ||
@@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
652 | struct probe_arg *parg, int is_return) | 813 | struct probe_arg *parg, int is_return) |
653 | { | 814 | { |
654 | const char *t; | 815 | const char *t; |
816 | int ret; | ||
655 | 817 | ||
656 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 818 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
657 | pr_info("Argument is too long.: %s\n", arg); | 819 | pr_info("Argument is too long.: %s\n", arg); |
@@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
674 | } | 836 | } |
675 | parg->offset = tp->size; | 837 | parg->offset = tp->size; |
676 | tp->size += parg->type->size; | 838 | tp->size += parg->type->size; |
677 | return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 839 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
840 | if (ret >= 0) { | ||
841 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
842 | parg->fetch.fn); | ||
843 | parg->fetch_size.data = parg->fetch.data; | ||
844 | } | ||
845 | return ret; | ||
678 | } | 846 | } |
679 | 847 | ||
680 | /* Return 1 if name is reserved or already used by another argument */ | 848 | /* Return 1 if name is reserved or already used by another argument */ |
@@ -715,7 +883,7 @@ static int create_trace_probe(int argc, char **argv) | |||
715 | int i, ret = 0; | 883 | int i, ret = 0; |
716 | int is_return = 0, is_delete = 0; | 884 | int is_return = 0, is_delete = 0; |
717 | char *symbol = NULL, *event = NULL, *group = NULL; | 885 | char *symbol = NULL, *event = NULL, *group = NULL; |
718 | char *arg, *tmp; | 886 | char *arg; |
719 | unsigned long offset = 0; | 887 | unsigned long offset = 0; |
720 | void *addr = NULL; | 888 | void *addr = NULL; |
721 | char buf[MAX_EVENT_NAME_LEN]; | 889 | char buf[MAX_EVENT_NAME_LEN]; |
@@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) | |||
757 | pr_info("Delete command needs an event name.\n"); | 925 | pr_info("Delete command needs an event name.\n"); |
758 | return -EINVAL; | 926 | return -EINVAL; |
759 | } | 927 | } |
928 | mutex_lock(&probe_lock); | ||
760 | tp = find_probe_event(event, group); | 929 | tp = find_probe_event(event, group); |
761 | if (!tp) { | 930 | if (!tp) { |
931 | mutex_unlock(&probe_lock); | ||
762 | pr_info("Event %s/%s doesn't exist.\n", group, event); | 932 | pr_info("Event %s/%s doesn't exist.\n", group, event); |
763 | return -ENOENT; | 933 | return -ENOENT; |
764 | } | 934 | } |
765 | /* delete an event */ | 935 | /* delete an event */ |
766 | unregister_trace_probe(tp); | 936 | unregister_trace_probe(tp); |
767 | free_trace_probe(tp); | 937 | free_trace_probe(tp); |
938 | mutex_unlock(&probe_lock); | ||
768 | return 0; | 939 | return 0; |
769 | } | 940 | } |
770 | 941 | ||
@@ -821,26 +992,36 @@ static int create_trace_probe(int argc, char **argv) | |||
821 | /* parse arguments */ | 992 | /* parse arguments */ |
822 | ret = 0; | 993 | ret = 0; |
823 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | 994 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { |
995 | /* Increment count for freeing args in error case */ | ||
996 | tp->nr_args++; | ||
997 | |||
824 | /* Parse argument name */ | 998 | /* Parse argument name */ |
825 | arg = strchr(argv[i], '='); | 999 | arg = strchr(argv[i], '='); |
826 | if (arg) | 1000 | if (arg) { |
827 | *arg++ = '\0'; | 1001 | *arg++ = '\0'; |
828 | else | 1002 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); |
1003 | } else { | ||
829 | arg = argv[i]; | 1004 | arg = argv[i]; |
1005 | /* If argument name is omitted, set "argN" */ | ||
1006 | snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); | ||
1007 | tp->args[i].name = kstrdup(buf, GFP_KERNEL); | ||
1008 | } | ||
830 | 1009 | ||
831 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
832 | if (!tp->args[i].name) { | 1010 | if (!tp->args[i].name) { |
833 | pr_info("Failed to allocate argument%d name '%s'.\n", | 1011 | pr_info("Failed to allocate argument[%d] name.\n", i); |
834 | i, argv[i]); | ||
835 | ret = -ENOMEM; | 1012 | ret = -ENOMEM; |
836 | goto error; | 1013 | goto error; |
837 | } | 1014 | } |
838 | tmp = strchr(tp->args[i].name, ':'); | 1015 | |
839 | if (tmp) | 1016 | if (!is_good_name(tp->args[i].name)) { |
840 | *tmp = '_'; /* convert : to _ */ | 1017 | pr_info("Invalid argument[%d] name: %s\n", |
1018 | i, tp->args[i].name); | ||
1019 | ret = -EINVAL; | ||
1020 | goto error; | ||
1021 | } | ||
841 | 1022 | ||
842 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { | 1023 | if (conflict_field_name(tp->args[i].name, tp->args, i)) { |
843 | pr_info("Argument%d name '%s' conflicts with " | 1024 | pr_info("Argument[%d] name '%s' conflicts with " |
844 | "another field.\n", i, argv[i]); | 1025 | "another field.\n", i, argv[i]); |
845 | ret = -EINVAL; | 1026 | ret = -EINVAL; |
846 | goto error; | 1027 | goto error; |
@@ -849,12 +1030,9 @@ static int create_trace_probe(int argc, char **argv) | |||
849 | /* Parse fetch argument */ | 1030 | /* Parse fetch argument */ |
850 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); | 1031 | ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); |
851 | if (ret) { | 1032 | if (ret) { |
852 | pr_info("Parse error at argument%d. (%d)\n", i, ret); | 1033 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); |
853 | kfree(tp->args[i].name); | ||
854 | goto error; | 1034 | goto error; |
855 | } | 1035 | } |
856 | |||
857 | tp->nr_args++; | ||
858 | } | 1036 | } |
859 | 1037 | ||
860 | ret = register_trace_probe(tp); | 1038 | ret = register_trace_probe(tp); |
@@ -1043,6 +1221,54 @@ static const struct file_operations kprobe_profile_ops = { | |||
1043 | .release = seq_release, | 1221 | .release = seq_release, |
1044 | }; | 1222 | }; |
1045 | 1223 | ||
1224 | /* Sum up total data length for dynamic arrays (strings) */ | ||
1225 | static __kprobes int __get_data_size(struct trace_probe *tp, | ||
1226 | struct pt_regs *regs) | ||
1227 | { | ||
1228 | int i, ret = 0; | ||
1229 | u32 len; | ||
1230 | |||
1231 | for (i = 0; i < tp->nr_args; i++) | ||
1232 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
1233 | call_fetch(&tp->args[i].fetch_size, regs, &len); | ||
1234 | ret += len; | ||
1235 | } | ||
1236 | |||
1237 | return ret; | ||
1238 | } | ||
1239 | |||
1240 | /* Store the value of each argument */ | ||
1241 | static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, | ||
1242 | struct pt_regs *regs, | ||
1243 | u8 *data, int maxlen) | ||
1244 | { | ||
1245 | int i; | ||
1246 | u32 end = tp->size; | ||
1247 | u32 *dl; /* Data (relative) location */ | ||
1248 | |||
1249 | for (i = 0; i < tp->nr_args; i++) { | ||
1250 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
1251 | /* | ||
1252 | * First, we set the relative location and | ||
1253 | * maximum data length to *dl | ||
1254 | */ | ||
1255 | dl = (u32 *)(data + tp->args[i].offset); | ||
1256 | *dl = make_data_rloc(maxlen, end - tp->args[i].offset); | ||
1257 | /* Then try to fetch string or dynamic array data */ | ||
1258 | call_fetch(&tp->args[i].fetch, regs, dl); | ||
1259 | /* Reduce maximum length */ | ||
1260 | end += get_rloc_len(*dl); | ||
1261 | maxlen -= get_rloc_len(*dl); | ||
1262 | /* Trick here, convert data_rloc to data_loc */ | ||
1263 | *dl = convert_rloc_to_loc(*dl, | ||
1264 | ent_size + tp->args[i].offset); | ||
1265 | } else | ||
1266 | /* Just fetching data normally */ | ||
1267 | call_fetch(&tp->args[i].fetch, regs, | ||
1268 | data + tp->args[i].offset); | ||
1269 | } | ||
1270 | } | ||
1271 | |||
1046 | /* Kprobe handler */ | 1272 | /* Kprobe handler */ |
1047 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | 1273 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) |
1048 | { | 1274 | { |
@@ -1050,8 +1276,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1050 | struct kprobe_trace_entry_head *entry; | 1276 | struct kprobe_trace_entry_head *entry; |
1051 | struct ring_buffer_event *event; | 1277 | struct ring_buffer_event *event; |
1052 | struct ring_buffer *buffer; | 1278 | struct ring_buffer *buffer; |
1053 | u8 *data; | 1279 | int size, dsize, pc; |
1054 | int size, i, pc; | ||
1055 | unsigned long irq_flags; | 1280 | unsigned long irq_flags; |
1056 | struct ftrace_event_call *call = &tp->call; | 1281 | struct ftrace_event_call *call = &tp->call; |
1057 | 1282 | ||
@@ -1060,7 +1285,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1060 | local_save_flags(irq_flags); | 1285 | local_save_flags(irq_flags); |
1061 | pc = preempt_count(); | 1286 | pc = preempt_count(); |
1062 | 1287 | ||
1063 | size = sizeof(*entry) + tp->size; | 1288 | dsize = __get_data_size(tp, regs); |
1289 | size = sizeof(*entry) + tp->size + dsize; | ||
1064 | 1290 | ||
1065 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 1291 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
1066 | size, irq_flags, pc); | 1292 | size, irq_flags, pc); |
@@ -1069,9 +1295,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
1069 | 1295 | ||
1070 | entry = ring_buffer_event_data(event); | 1296 | entry = ring_buffer_event_data(event); |
1071 | entry->ip = (unsigned long)kp->addr; | 1297 | entry->ip = (unsigned long)kp->addr; |
1072 | data = (u8 *)&entry[1]; | 1298 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1073 | for (i = 0; i < tp->nr_args; i++) | ||
1074 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1075 | 1299 | ||
1076 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1300 | if (!filter_current_check_discard(buffer, call, entry, event)) |
1077 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1301 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
@@ -1085,15 +1309,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
1085 | struct kretprobe_trace_entry_head *entry; | 1309 | struct kretprobe_trace_entry_head *entry; |
1086 | struct ring_buffer_event *event; | 1310 | struct ring_buffer_event *event; |
1087 | struct ring_buffer *buffer; | 1311 | struct ring_buffer *buffer; |
1088 | u8 *data; | 1312 | int size, pc, dsize; |
1089 | int size, i, pc; | ||
1090 | unsigned long irq_flags; | 1313 | unsigned long irq_flags; |
1091 | struct ftrace_event_call *call = &tp->call; | 1314 | struct ftrace_event_call *call = &tp->call; |
1092 | 1315 | ||
1093 | local_save_flags(irq_flags); | 1316 | local_save_flags(irq_flags); |
1094 | pc = preempt_count(); | 1317 | pc = preempt_count(); |
1095 | 1318 | ||
1096 | size = sizeof(*entry) + tp->size; | 1319 | dsize = __get_data_size(tp, regs); |
1320 | size = sizeof(*entry) + tp->size + dsize; | ||
1097 | 1321 | ||
1098 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 1322 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
1099 | size, irq_flags, pc); | 1323 | size, irq_flags, pc); |
@@ -1103,9 +1327,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
1103 | entry = ring_buffer_event_data(event); | 1327 | entry = ring_buffer_event_data(event); |
1104 | entry->func = (unsigned long)tp->rp.kp.addr; | 1328 | entry->func = (unsigned long)tp->rp.kp.addr; |
1105 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1329 | entry->ret_ip = (unsigned long)ri->ret_addr; |
1106 | data = (u8 *)&entry[1]; | 1330 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1107 | for (i = 0; i < tp->nr_args; i++) | ||
1108 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1109 | 1331 | ||
1110 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1332 | if (!filter_current_check_discard(buffer, call, entry, event)) |
1111 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1333 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
@@ -1137,7 +1359,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, | |||
1137 | data = (u8 *)&field[1]; | 1359 | data = (u8 *)&field[1]; |
1138 | for (i = 0; i < tp->nr_args; i++) | 1360 | for (i = 0; i < tp->nr_args; i++) |
1139 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1361 | if (!tp->args[i].type->print(s, tp->args[i].name, |
1140 | data + tp->args[i].offset)) | 1362 | data + tp->args[i].offset, field)) |
1141 | goto partial; | 1363 | goto partial; |
1142 | 1364 | ||
1143 | if (!trace_seq_puts(s, "\n")) | 1365 | if (!trace_seq_puts(s, "\n")) |
@@ -1179,7 +1401,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, | |||
1179 | data = (u8 *)&field[1]; | 1401 | data = (u8 *)&field[1]; |
1180 | for (i = 0; i < tp->nr_args; i++) | 1402 | for (i = 0; i < tp->nr_args; i++) |
1181 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1403 | if (!tp->args[i].type->print(s, tp->args[i].name, |
1182 | data + tp->args[i].offset)) | 1404 | data + tp->args[i].offset, field)) |
1183 | goto partial; | 1405 | goto partial; |
1184 | 1406 | ||
1185 | if (!trace_seq_puts(s, "\n")) | 1407 | if (!trace_seq_puts(s, "\n")) |
@@ -1214,11 +1436,6 @@ static void probe_event_disable(struct ftrace_event_call *call) | |||
1214 | } | 1436 | } |
1215 | } | 1437 | } |
1216 | 1438 | ||
1217 | static int probe_event_raw_init(struct ftrace_event_call *event_call) | ||
1218 | { | ||
1219 | return 0; | ||
1220 | } | ||
1221 | |||
1222 | #undef DEFINE_FIELD | 1439 | #undef DEFINE_FIELD |
1223 | #define DEFINE_FIELD(type, item, name, is_signed) \ | 1440 | #define DEFINE_FIELD(type, item, name, is_signed) \ |
1224 | do { \ | 1441 | do { \ |
@@ -1239,7 +1456,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1239 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | 1456 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); |
1240 | /* Set argument names as fields */ | 1457 | /* Set argument names as fields */ |
1241 | for (i = 0; i < tp->nr_args; i++) { | 1458 | for (i = 0; i < tp->nr_args; i++) { |
1242 | ret = trace_define_field(event_call, tp->args[i].type->name, | 1459 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, |
1243 | tp->args[i].name, | 1460 | tp->args[i].name, |
1244 | sizeof(field) + tp->args[i].offset, | 1461 | sizeof(field) + tp->args[i].offset, |
1245 | tp->args[i].type->size, | 1462 | tp->args[i].type->size, |
@@ -1261,7 +1478,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1261 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); | 1478 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); |
1262 | /* Set argument names as fields */ | 1479 | /* Set argument names as fields */ |
1263 | for (i = 0; i < tp->nr_args; i++) { | 1480 | for (i = 0; i < tp->nr_args; i++) { |
1264 | ret = trace_define_field(event_call, tp->args[i].type->name, | 1481 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, |
1265 | tp->args[i].name, | 1482 | tp->args[i].name, |
1266 | sizeof(field) + tp->args[i].offset, | 1483 | sizeof(field) + tp->args[i].offset, |
1267 | tp->args[i].type->size, | 1484 | tp->args[i].type->size, |
@@ -1301,8 +1518,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) | |||
1301 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | 1518 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); |
1302 | 1519 | ||
1303 | for (i = 0; i < tp->nr_args; i++) { | 1520 | for (i = 0; i < tp->nr_args; i++) { |
1304 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | 1521 | if (strcmp(tp->args[i].type->name, "string") == 0) |
1305 | tp->args[i].name); | 1522 | pos += snprintf(buf + pos, LEN_OR_ZERO, |
1523 | ", __get_str(%s)", | ||
1524 | tp->args[i].name); | ||
1525 | else | ||
1526 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
1527 | tp->args[i].name); | ||
1306 | } | 1528 | } |
1307 | 1529 | ||
1308 | #undef LEN_OR_ZERO | 1530 | #undef LEN_OR_ZERO |
@@ -1339,11 +1561,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1339 | struct ftrace_event_call *call = &tp->call; | 1561 | struct ftrace_event_call *call = &tp->call; |
1340 | struct kprobe_trace_entry_head *entry; | 1562 | struct kprobe_trace_entry_head *entry; |
1341 | struct hlist_head *head; | 1563 | struct hlist_head *head; |
1342 | u8 *data; | 1564 | int size, __size, dsize; |
1343 | int size, __size, i; | ||
1344 | int rctx; | 1565 | int rctx; |
1345 | 1566 | ||
1346 | __size = sizeof(*entry) + tp->size; | 1567 | dsize = __get_data_size(tp, regs); |
1568 | __size = sizeof(*entry) + tp->size + dsize; | ||
1347 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1569 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1348 | size -= sizeof(u32); | 1570 | size -= sizeof(u32); |
1349 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | 1571 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
@@ -1355,9 +1577,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1355 | return; | 1577 | return; |
1356 | 1578 | ||
1357 | entry->ip = (unsigned long)kp->addr; | 1579 | entry->ip = (unsigned long)kp->addr; |
1358 | data = (u8 *)&entry[1]; | 1580 | memset(&entry[1], 0, dsize); |
1359 | for (i = 0; i < tp->nr_args; i++) | 1581 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1360 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1361 | 1582 | ||
1362 | head = this_cpu_ptr(call->perf_events); | 1583 | head = this_cpu_ptr(call->perf_events); |
1363 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 1584 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); |
@@ -1371,11 +1592,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1371 | struct ftrace_event_call *call = &tp->call; | 1592 | struct ftrace_event_call *call = &tp->call; |
1372 | struct kretprobe_trace_entry_head *entry; | 1593 | struct kretprobe_trace_entry_head *entry; |
1373 | struct hlist_head *head; | 1594 | struct hlist_head *head; |
1374 | u8 *data; | 1595 | int size, __size, dsize; |
1375 | int size, __size, i; | ||
1376 | int rctx; | 1596 | int rctx; |
1377 | 1597 | ||
1378 | __size = sizeof(*entry) + tp->size; | 1598 | dsize = __get_data_size(tp, regs); |
1599 | __size = sizeof(*entry) + tp->size + dsize; | ||
1379 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1600 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1380 | size -= sizeof(u32); | 1601 | size -= sizeof(u32); |
1381 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | 1602 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
@@ -1388,9 +1609,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1388 | 1609 | ||
1389 | entry->func = (unsigned long)tp->rp.kp.addr; | 1610 | entry->func = (unsigned long)tp->rp.kp.addr; |
1390 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1611 | entry->ret_ip = (unsigned long)ri->ret_addr; |
1391 | data = (u8 *)&entry[1]; | 1612 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1392 | for (i = 0; i < tp->nr_args; i++) | ||
1393 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
1394 | 1613 | ||
1395 | head = this_cpu_ptr(call->perf_events); | 1614 | head = this_cpu_ptr(call->perf_events); |
1396 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1615 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); |
@@ -1486,15 +1705,12 @@ static int register_probe_event(struct trace_probe *tp) | |||
1486 | int ret; | 1705 | int ret; |
1487 | 1706 | ||
1488 | /* Initialize ftrace_event_call */ | 1707 | /* Initialize ftrace_event_call */ |
1708 | INIT_LIST_HEAD(&call->class->fields); | ||
1489 | if (probe_is_return(tp)) { | 1709 | if (probe_is_return(tp)) { |
1490 | INIT_LIST_HEAD(&call->class->fields); | ||
1491 | call->event.funcs = &kretprobe_funcs; | 1710 | call->event.funcs = &kretprobe_funcs; |
1492 | call->class->raw_init = probe_event_raw_init; | ||
1493 | call->class->define_fields = kretprobe_event_define_fields; | 1711 | call->class->define_fields = kretprobe_event_define_fields; |
1494 | } else { | 1712 | } else { |
1495 | INIT_LIST_HEAD(&call->class->fields); | ||
1496 | call->event.funcs = &kprobe_funcs; | 1713 | call->event.funcs = &kprobe_funcs; |
1497 | call->class->raw_init = probe_event_raw_init; | ||
1498 | call->class->define_fields = kprobe_event_define_fields; | 1714 | call->class->define_fields = kprobe_event_define_fields; |
1499 | } | 1715 | } |
1500 | if (set_print_fmt(tp) < 0) | 1716 | if (set_print_fmt(tp) < 0) |
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf00749b65..000000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null | |||
@@ -1,508 +0,0 @@ | |||
1 | /* | ||
2 | * trace_ksym.c - Kernel Symbol Tracer | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2009 | ||
19 | */ | ||
20 | |||
21 | #include <linux/kallsyms.h> | ||
22 | #include <linux/uaccess.h> | ||
23 | #include <linux/debugfs.h> | ||
24 | #include <linux/ftrace.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/slab.h> | ||
27 | #include <linux/fs.h> | ||
28 | |||
29 | #include "trace_output.h" | ||
30 | #include "trace.h" | ||
31 | |||
32 | #include <linux/hw_breakpoint.h> | ||
33 | #include <asm/hw_breakpoint.h> | ||
34 | |||
35 | #include <asm/atomic.h> | ||
36 | |||
37 | #define KSYM_TRACER_OP_LEN 3 /* rw- */ | ||
38 | |||
39 | struct trace_ksym { | ||
40 | struct perf_event **ksym_hbp; | ||
41 | struct perf_event_attr attr; | ||
42 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
43 | atomic64_t counter; | ||
44 | #endif | ||
45 | struct hlist_node ksym_hlist; | ||
46 | }; | ||
47 | |||
48 | static struct trace_array *ksym_trace_array; | ||
49 | |||
50 | static unsigned int ksym_tracing_enabled; | ||
51 | |||
52 | static HLIST_HEAD(ksym_filter_head); | ||
53 | |||
54 | static DEFINE_MUTEX(ksym_tracer_mutex); | ||
55 | |||
56 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
57 | |||
58 | #define MAX_UL_INT 0xffffffff | ||
59 | |||
60 | void ksym_collect_stats(unsigned long hbp_hit_addr) | ||
61 | { | ||
62 | struct hlist_node *node; | ||
63 | struct trace_ksym *entry; | ||
64 | |||
65 | rcu_read_lock(); | ||
66 | hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { | ||
67 | if (entry->attr.bp_addr == hbp_hit_addr) { | ||
68 | atomic64_inc(&entry->counter); | ||
69 | break; | ||
70 | } | ||
71 | } | ||
72 | rcu_read_unlock(); | ||
73 | } | ||
74 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
75 | |||
76 | void ksym_hbp_handler(struct perf_event *hbp, int nmi, | ||
77 | struct perf_sample_data *data, | ||
78 | struct pt_regs *regs) | ||
79 | { | ||
80 | struct ring_buffer_event *event; | ||
81 | struct ksym_trace_entry *entry; | ||
82 | struct ring_buffer *buffer; | ||
83 | int pc; | ||
84 | |||
85 | if (!ksym_tracing_enabled) | ||
86 | return; | ||
87 | |||
88 | buffer = ksym_trace_array->buffer; | ||
89 | |||
90 | pc = preempt_count(); | ||
91 | |||
92 | event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, | ||
93 | sizeof(*entry), 0, pc); | ||
94 | if (!event) | ||
95 | return; | ||
96 | |||
97 | entry = ring_buffer_event_data(event); | ||
98 | entry->ip = instruction_pointer(regs); | ||
99 | entry->type = hw_breakpoint_type(hbp); | ||
100 | entry->addr = hw_breakpoint_addr(hbp); | ||
101 | strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); | ||
102 | |||
103 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
104 | ksym_collect_stats(hw_breakpoint_addr(hbp)); | ||
105 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
106 | |||
107 | trace_buffer_unlock_commit(buffer, event, 0, pc); | ||
108 | } | ||
109 | |||
110 | /* Valid access types are represented as | ||
111 | * | ||
112 | * rw- : Set Read/Write Access Breakpoint | ||
113 | * -w- : Set Write Access Breakpoint | ||
114 | * --- : Clear Breakpoints | ||
115 | * --x : Set Execution Break points (Not available yet) | ||
116 | * | ||
117 | */ | ||
118 | static int ksym_trace_get_access_type(char *str) | ||
119 | { | ||
120 | int access = 0; | ||
121 | |||
122 | if (str[0] == 'r') | ||
123 | access |= HW_BREAKPOINT_R; | ||
124 | |||
125 | if (str[1] == 'w') | ||
126 | access |= HW_BREAKPOINT_W; | ||
127 | |||
128 | if (str[2] == 'x') | ||
129 | access |= HW_BREAKPOINT_X; | ||
130 | |||
131 | switch (access) { | ||
132 | case HW_BREAKPOINT_R: | ||
133 | case HW_BREAKPOINT_W: | ||
134 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
135 | return access; | ||
136 | default: | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * There can be several possible malformed requests and we attempt to capture | ||
143 | * all of them. We enumerate some of the rules | ||
144 | * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. | ||
145 | * i.e. multiple ':' symbols disallowed. Possible uses are of the form | ||
146 | * <module>:<ksym_name>:<op>. | ||
147 | * 2. No delimiter symbol ':' in the input string | ||
148 | * 3. Spurious operator symbols or symbols not in their respective positions | ||
149 | * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file | ||
150 | * 5. Kernel symbol not a part of /proc/kallsyms | ||
151 | * 6. Duplicate requests | ||
152 | */ | ||
153 | static int parse_ksym_trace_str(char *input_string, char **ksymname, | ||
154 | unsigned long *addr) | ||
155 | { | ||
156 | int ret; | ||
157 | |||
158 | *ksymname = strsep(&input_string, ":"); | ||
159 | *addr = kallsyms_lookup_name(*ksymname); | ||
160 | |||
161 | /* Check for malformed request: (2), (1) and (5) */ | ||
162 | if ((!input_string) || | ||
163 | (strlen(input_string) != KSYM_TRACER_OP_LEN) || | ||
164 | (*addr == 0)) | ||
165 | return -EINVAL;; | ||
166 | |||
167 | ret = ksym_trace_get_access_type(input_string); | ||
168 | |||
169 | return ret; | ||
170 | } | ||
171 | |||
172 | int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) | ||
173 | { | ||
174 | struct trace_ksym *entry; | ||
175 | int ret = -ENOMEM; | ||
176 | |||
177 | entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); | ||
178 | if (!entry) | ||
179 | return -ENOMEM; | ||
180 | |||
181 | hw_breakpoint_init(&entry->attr); | ||
182 | |||
183 | entry->attr.bp_type = op; | ||
184 | entry->attr.bp_addr = addr; | ||
185 | entry->attr.bp_len = HW_BREAKPOINT_LEN_4; | ||
186 | |||
187 | entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, | ||
188 | ksym_hbp_handler); | ||
189 | |||
190 | if (IS_ERR(entry->ksym_hbp)) { | ||
191 | ret = PTR_ERR(entry->ksym_hbp); | ||
192 | if (ret == -ENOSPC) { | ||
193 | printk(KERN_ERR "ksym_tracer: Maximum limit reached." | ||
194 | " No new requests for tracing can be accepted now.\n"); | ||
195 | } else { | ||
196 | printk(KERN_INFO "ksym_tracer request failed. Try again" | ||
197 | " later!!\n"); | ||
198 | } | ||
199 | goto err; | ||
200 | } | ||
201 | |||
202 | hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); | ||
203 | |||
204 | return 0; | ||
205 | |||
206 | err: | ||
207 | kfree(entry); | ||
208 | |||
209 | return ret; | ||
210 | } | ||
211 | |||
212 | static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, | ||
213 | size_t count, loff_t *ppos) | ||
214 | { | ||
215 | struct trace_ksym *entry; | ||
216 | struct hlist_node *node; | ||
217 | struct trace_seq *s; | ||
218 | ssize_t cnt = 0; | ||
219 | int ret; | ||
220 | |||
221 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
222 | if (!s) | ||
223 | return -ENOMEM; | ||
224 | trace_seq_init(s); | ||
225 | |||
226 | mutex_lock(&ksym_tracer_mutex); | ||
227 | |||
228 | hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { | ||
229 | ret = trace_seq_printf(s, "%pS:", | ||
230 | (void *)(unsigned long)entry->attr.bp_addr); | ||
231 | if (entry->attr.bp_type == HW_BREAKPOINT_R) | ||
232 | ret = trace_seq_puts(s, "r--\n"); | ||
233 | else if (entry->attr.bp_type == HW_BREAKPOINT_W) | ||
234 | ret = trace_seq_puts(s, "-w-\n"); | ||
235 | else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) | ||
236 | ret = trace_seq_puts(s, "rw-\n"); | ||
237 | WARN_ON_ONCE(!ret); | ||
238 | } | ||
239 | |||
240 | cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | ||
241 | |||
242 | mutex_unlock(&ksym_tracer_mutex); | ||
243 | |||
244 | kfree(s); | ||
245 | |||
246 | return cnt; | ||
247 | } | ||
248 | |||
249 | static void __ksym_trace_reset(void) | ||
250 | { | ||
251 | struct trace_ksym *entry; | ||
252 | struct hlist_node *node, *node1; | ||
253 | |||
254 | mutex_lock(&ksym_tracer_mutex); | ||
255 | hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, | ||
256 | ksym_hlist) { | ||
257 | unregister_wide_hw_breakpoint(entry->ksym_hbp); | ||
258 | hlist_del_rcu(&(entry->ksym_hlist)); | ||
259 | synchronize_rcu(); | ||
260 | kfree(entry); | ||
261 | } | ||
262 | mutex_unlock(&ksym_tracer_mutex); | ||
263 | } | ||
264 | |||
265 | static ssize_t ksym_trace_filter_write(struct file *file, | ||
266 | const char __user *buffer, | ||
267 | size_t count, loff_t *ppos) | ||
268 | { | ||
269 | struct trace_ksym *entry; | ||
270 | struct hlist_node *node; | ||
271 | char *buf, *input_string, *ksymname = NULL; | ||
272 | unsigned long ksym_addr = 0; | ||
273 | int ret, op, changed = 0; | ||
274 | |||
275 | buf = kzalloc(count + 1, GFP_KERNEL); | ||
276 | if (!buf) | ||
277 | return -ENOMEM; | ||
278 | |||
279 | ret = -EFAULT; | ||
280 | if (copy_from_user(buf, buffer, count)) | ||
281 | goto out; | ||
282 | |||
283 | buf[count] = '\0'; | ||
284 | input_string = strstrip(buf); | ||
285 | |||
286 | /* | ||
287 | * Clear all breakpoints if: | ||
288 | * 1: echo > ksym_trace_filter | ||
289 | * 2: echo 0 > ksym_trace_filter | ||
290 | * 3: echo "*:---" > ksym_trace_filter | ||
291 | */ | ||
292 | if (!input_string[0] || !strcmp(input_string, "0") || | ||
293 | !strcmp(input_string, "*:---")) { | ||
294 | __ksym_trace_reset(); | ||
295 | ret = 0; | ||
296 | goto out; | ||
297 | } | ||
298 | |||
299 | ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); | ||
300 | if (ret < 0) | ||
301 | goto out; | ||
302 | |||
303 | mutex_lock(&ksym_tracer_mutex); | ||
304 | |||
305 | ret = -EINVAL; | ||
306 | hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { | ||
307 | if (entry->attr.bp_addr == ksym_addr) { | ||
308 | /* Check for malformed request: (6) */ | ||
309 | if (entry->attr.bp_type != op) | ||
310 | changed = 1; | ||
311 | else | ||
312 | goto out_unlock; | ||
313 | break; | ||
314 | } | ||
315 | } | ||
316 | if (changed) { | ||
317 | unregister_wide_hw_breakpoint(entry->ksym_hbp); | ||
318 | entry->attr.bp_type = op; | ||
319 | ret = 0; | ||
320 | if (op > 0) { | ||
321 | entry->ksym_hbp = | ||
322 | register_wide_hw_breakpoint(&entry->attr, | ||
323 | ksym_hbp_handler); | ||
324 | if (IS_ERR(entry->ksym_hbp)) | ||
325 | ret = PTR_ERR(entry->ksym_hbp); | ||
326 | else | ||
327 | goto out_unlock; | ||
328 | } | ||
329 | /* Error or "symbol:---" case: drop it */ | ||
330 | hlist_del_rcu(&(entry->ksym_hlist)); | ||
331 | synchronize_rcu(); | ||
332 | kfree(entry); | ||
333 | goto out_unlock; | ||
334 | } else { | ||
335 | /* Check for malformed request: (4) */ | ||
336 | if (op) | ||
337 | ret = process_new_ksym_entry(ksymname, op, ksym_addr); | ||
338 | } | ||
339 | out_unlock: | ||
340 | mutex_unlock(&ksym_tracer_mutex); | ||
341 | out: | ||
342 | kfree(buf); | ||
343 | return !ret ? count : ret; | ||
344 | } | ||
345 | |||
346 | static const struct file_operations ksym_tracing_fops = { | ||
347 | .open = tracing_open_generic, | ||
348 | .read = ksym_trace_filter_read, | ||
349 | .write = ksym_trace_filter_write, | ||
350 | }; | ||
351 | |||
352 | static void ksym_trace_reset(struct trace_array *tr) | ||
353 | { | ||
354 | ksym_tracing_enabled = 0; | ||
355 | __ksym_trace_reset(); | ||
356 | } | ||
357 | |||
358 | static int ksym_trace_init(struct trace_array *tr) | ||
359 | { | ||
360 | int cpu, ret = 0; | ||
361 | |||
362 | for_each_online_cpu(cpu) | ||
363 | tracing_reset(tr, cpu); | ||
364 | ksym_tracing_enabled = 1; | ||
365 | ksym_trace_array = tr; | ||
366 | |||
367 | return ret; | ||
368 | } | ||
369 | |||
370 | static void ksym_trace_print_header(struct seq_file *m) | ||
371 | { | ||
372 | seq_puts(m, | ||
373 | "# TASK-PID CPU# Symbol " | ||
374 | "Type Function\n"); | ||
375 | seq_puts(m, | ||
376 | "# | | | " | ||
377 | " | |\n"); | ||
378 | } | ||
379 | |||
380 | static enum print_line_t ksym_trace_output(struct trace_iterator *iter) | ||
381 | { | ||
382 | struct trace_entry *entry = iter->ent; | ||
383 | struct trace_seq *s = &iter->seq; | ||
384 | struct ksym_trace_entry *field; | ||
385 | char str[KSYM_SYMBOL_LEN]; | ||
386 | int ret; | ||
387 | |||
388 | if (entry->type != TRACE_KSYM) | ||
389 | return TRACE_TYPE_UNHANDLED; | ||
390 | |||
391 | trace_assign_type(field, entry); | ||
392 | |||
393 | ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, | ||
394 | entry->pid, iter->cpu, (char *)field->addr); | ||
395 | if (!ret) | ||
396 | return TRACE_TYPE_PARTIAL_LINE; | ||
397 | |||
398 | switch (field->type) { | ||
399 | case HW_BREAKPOINT_R: | ||
400 | ret = trace_seq_printf(s, " R "); | ||
401 | break; | ||
402 | case HW_BREAKPOINT_W: | ||
403 | ret = trace_seq_printf(s, " W "); | ||
404 | break; | ||
405 | case HW_BREAKPOINT_R | HW_BREAKPOINT_W: | ||
406 | ret = trace_seq_printf(s, " RW "); | ||
407 | break; | ||
408 | default: | ||
409 | return TRACE_TYPE_PARTIAL_LINE; | ||
410 | } | ||
411 | |||
412 | if (!ret) | ||
413 | return TRACE_TYPE_PARTIAL_LINE; | ||
414 | |||
415 | sprint_symbol(str, field->ip); | ||
416 | ret = trace_seq_printf(s, "%s\n", str); | ||
417 | if (!ret) | ||
418 | return TRACE_TYPE_PARTIAL_LINE; | ||
419 | |||
420 | return TRACE_TYPE_HANDLED; | ||
421 | } | ||
422 | |||
423 | struct tracer ksym_tracer __read_mostly = | ||
424 | { | ||
425 | .name = "ksym_tracer", | ||
426 | .init = ksym_trace_init, | ||
427 | .reset = ksym_trace_reset, | ||
428 | #ifdef CONFIG_FTRACE_SELFTEST | ||
429 | .selftest = trace_selftest_startup_ksym, | ||
430 | #endif | ||
431 | .print_header = ksym_trace_print_header, | ||
432 | .print_line = ksym_trace_output | ||
433 | }; | ||
434 | |||
435 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
436 | static int ksym_profile_show(struct seq_file *m, void *v) | ||
437 | { | ||
438 | struct hlist_node *node; | ||
439 | struct trace_ksym *entry; | ||
440 | int access_type = 0; | ||
441 | char fn_name[KSYM_NAME_LEN]; | ||
442 | |||
443 | seq_puts(m, " Access Type "); | ||
444 | seq_puts(m, " Symbol Counter\n"); | ||
445 | seq_puts(m, " ----------- "); | ||
446 | seq_puts(m, " ------ -------\n"); | ||
447 | |||
448 | rcu_read_lock(); | ||
449 | hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { | ||
450 | |||
451 | access_type = entry->attr.bp_type; | ||
452 | |||
453 | switch (access_type) { | ||
454 | case HW_BREAKPOINT_R: | ||
455 | seq_puts(m, " R "); | ||
456 | break; | ||
457 | case HW_BREAKPOINT_W: | ||
458 | seq_puts(m, " W "); | ||
459 | break; | ||
460 | case HW_BREAKPOINT_R | HW_BREAKPOINT_W: | ||
461 | seq_puts(m, " RW "); | ||
462 | break; | ||
463 | default: | ||
464 | seq_puts(m, " NA "); | ||
465 | } | ||
466 | |||
467 | if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) | ||
468 | seq_printf(m, " %-36s", fn_name); | ||
469 | else | ||
470 | seq_printf(m, " %-36s", "<NA>"); | ||
471 | seq_printf(m, " %15llu\n", | ||
472 | (unsigned long long)atomic64_read(&entry->counter)); | ||
473 | } | ||
474 | rcu_read_unlock(); | ||
475 | |||
476 | return 0; | ||
477 | } | ||
478 | |||
479 | static int ksym_profile_open(struct inode *node, struct file *file) | ||
480 | { | ||
481 | return single_open(file, ksym_profile_show, NULL); | ||
482 | } | ||
483 | |||
484 | static const struct file_operations ksym_profile_fops = { | ||
485 | .open = ksym_profile_open, | ||
486 | .read = seq_read, | ||
487 | .llseek = seq_lseek, | ||
488 | .release = single_release, | ||
489 | }; | ||
490 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
491 | |||
492 | __init static int init_ksym_trace(void) | ||
493 | { | ||
494 | struct dentry *d_tracer; | ||
495 | |||
496 | d_tracer = tracing_init_dentry(); | ||
497 | |||
498 | trace_create_file("ksym_trace_filter", 0644, d_tracer, | ||
499 | NULL, &ksym_tracing_fops); | ||
500 | |||
501 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
502 | trace_create_file("ksym_profile", 0444, d_tracer, | ||
503 | NULL, &ksym_profile_fops); | ||
504 | #endif | ||
505 | |||
506 | return register_tracer(&ksym_tracer); | ||
507 | } | ||
508 | device_initcall(init_ksym_trace); | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..02272baa2206 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -16,9 +16,6 @@ | |||
16 | 16 | ||
17 | DECLARE_RWSEM(trace_event_mutex); | 17 | DECLARE_RWSEM(trace_event_mutex); |
18 | 18 | ||
19 | DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); | ||
20 | EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); | ||
21 | |||
22 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; | 19 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; |
23 | 20 | ||
24 | static int next_event_type = __TRACE_LAST_TYPE + 1; | 21 | static int next_event_type = __TRACE_LAST_TYPE + 1; |
@@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = { | |||
1069 | .funcs = &trace_wake_funcs, | 1066 | .funcs = &trace_wake_funcs, |
1070 | }; | 1067 | }; |
1071 | 1068 | ||
1072 | /* TRACE_SPECIAL */ | ||
1073 | static enum print_line_t trace_special_print(struct trace_iterator *iter, | ||
1074 | int flags, struct trace_event *event) | ||
1075 | { | ||
1076 | struct special_entry *field; | ||
1077 | |||
1078 | trace_assign_type(field, iter->ent); | ||
1079 | |||
1080 | if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", | ||
1081 | field->arg1, | ||
1082 | field->arg2, | ||
1083 | field->arg3)) | ||
1084 | return TRACE_TYPE_PARTIAL_LINE; | ||
1085 | |||
1086 | return TRACE_TYPE_HANDLED; | ||
1087 | } | ||
1088 | |||
1089 | static enum print_line_t trace_special_hex(struct trace_iterator *iter, | ||
1090 | int flags, struct trace_event *event) | ||
1091 | { | ||
1092 | struct special_entry *field; | ||
1093 | struct trace_seq *s = &iter->seq; | ||
1094 | |||
1095 | trace_assign_type(field, iter->ent); | ||
1096 | |||
1097 | SEQ_PUT_HEX_FIELD_RET(s, field->arg1); | ||
1098 | SEQ_PUT_HEX_FIELD_RET(s, field->arg2); | ||
1099 | SEQ_PUT_HEX_FIELD_RET(s, field->arg3); | ||
1100 | |||
1101 | return TRACE_TYPE_HANDLED; | ||
1102 | } | ||
1103 | |||
1104 | static enum print_line_t trace_special_bin(struct trace_iterator *iter, | ||
1105 | int flags, struct trace_event *event) | ||
1106 | { | ||
1107 | struct special_entry *field; | ||
1108 | struct trace_seq *s = &iter->seq; | ||
1109 | |||
1110 | trace_assign_type(field, iter->ent); | ||
1111 | |||
1112 | SEQ_PUT_FIELD_RET(s, field->arg1); | ||
1113 | SEQ_PUT_FIELD_RET(s, field->arg2); | ||
1114 | SEQ_PUT_FIELD_RET(s, field->arg3); | ||
1115 | |||
1116 | return TRACE_TYPE_HANDLED; | ||
1117 | } | ||
1118 | |||
1119 | static struct trace_event_functions trace_special_funcs = { | ||
1120 | .trace = trace_special_print, | ||
1121 | .raw = trace_special_print, | ||
1122 | .hex = trace_special_hex, | ||
1123 | .binary = trace_special_bin, | ||
1124 | }; | ||
1125 | |||
1126 | static struct trace_event trace_special_event = { | ||
1127 | .type = TRACE_SPECIAL, | ||
1128 | .funcs = &trace_special_funcs, | ||
1129 | }; | ||
1130 | |||
1131 | /* TRACE_STACK */ | 1069 | /* TRACE_STACK */ |
1132 | 1070 | ||
1133 | static enum print_line_t trace_stack_print(struct trace_iterator *iter, | 1071 | static enum print_line_t trace_stack_print(struct trace_iterator *iter, |
@@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
1161 | 1099 | ||
1162 | static struct trace_event_functions trace_stack_funcs = { | 1100 | static struct trace_event_functions trace_stack_funcs = { |
1163 | .trace = trace_stack_print, | 1101 | .trace = trace_stack_print, |
1164 | .raw = trace_special_print, | ||
1165 | .hex = trace_special_hex, | ||
1166 | .binary = trace_special_bin, | ||
1167 | }; | 1102 | }; |
1168 | 1103 | ||
1169 | static struct trace_event trace_stack_event = { | 1104 | static struct trace_event trace_stack_event = { |
@@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, | |||
1194 | 1129 | ||
1195 | static struct trace_event_functions trace_user_stack_funcs = { | 1130 | static struct trace_event_functions trace_user_stack_funcs = { |
1196 | .trace = trace_user_stack_print, | 1131 | .trace = trace_user_stack_print, |
1197 | .raw = trace_special_print, | ||
1198 | .hex = trace_special_hex, | ||
1199 | .binary = trace_special_bin, | ||
1200 | }; | 1132 | }; |
1201 | 1133 | ||
1202 | static struct trace_event trace_user_stack_event = { | 1134 | static struct trace_event trace_user_stack_event = { |
@@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = { | |||
1314 | &trace_fn_event, | 1246 | &trace_fn_event, |
1315 | &trace_ctx_event, | 1247 | &trace_ctx_event, |
1316 | &trace_wake_event, | 1248 | &trace_wake_event, |
1317 | &trace_special_event, | ||
1318 | &trace_stack_event, | 1249 | &trace_stack_event, |
1319 | &trace_user_stack_event, | 1250 | &trace_user_stack_event, |
1320 | &trace_bprint_event, | 1251 | &trace_bprint_event, |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c5..4086eae6e81b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
46 | struct trace_array_cpu *data; | 46 | struct trace_array_cpu *data; |
47 | unsigned long flags; | 47 | unsigned long flags; |
48 | long disabled; | 48 | long disabled; |
49 | int resched; | ||
50 | int cpu; | 49 | int cpu; |
51 | int pc; | 50 | int pc; |
52 | 51 | ||
@@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
54 | return; | 53 | return; |
55 | 54 | ||
56 | pc = preempt_count(); | 55 | pc = preempt_count(); |
57 | resched = ftrace_preempt_disable(); | 56 | preempt_disable_notrace(); |
58 | 57 | ||
59 | cpu = raw_smp_processor_id(); | 58 | cpu = raw_smp_processor_id(); |
60 | if (cpu != wakeup_current_cpu) | 59 | if (cpu != wakeup_current_cpu) |
@@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
74 | out: | 73 | out: |
75 | atomic_dec(&data->disabled); | 74 | atomic_dec(&data->disabled); |
76 | out_enable: | 75 | out_enable: |
77 | ftrace_preempt_enable(resched); | 76 | preempt_enable_notrace(); |
78 | } | 77 | } |
79 | 78 | ||
80 | static struct ftrace_ops trace_ops __read_mostly = | 79 | static struct ftrace_ops trace_ops __read_mostly = |
@@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
383 | #ifdef CONFIG_FTRACE_SELFTEST | 382 | #ifdef CONFIG_FTRACE_SELFTEST |
384 | .selftest = trace_selftest_startup_wakeup, | 383 | .selftest = trace_selftest_startup_wakeup, |
385 | #endif | 384 | #endif |
385 | .use_max_tr = 1, | ||
386 | }; | 386 | }; |
387 | 387 | ||
388 | static struct tracer wakeup_rt_tracer __read_mostly = | 388 | static struct tracer wakeup_rt_tracer __read_mostly = |
@@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
397 | #ifdef CONFIG_FTRACE_SELFTEST | 397 | #ifdef CONFIG_FTRACE_SELFTEST |
398 | .selftest = trace_selftest_startup_wakeup, | 398 | .selftest = trace_selftest_startup_wakeup, |
399 | #endif | 399 | #endif |
400 | .use_max_tr = 1, | ||
400 | }; | 401 | }; |
401 | 402 | ||
402 | __init static int init_wakeup_tracer(void) | 403 | __init static int init_wakeup_tracer(void) |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9bd2f0..155a415b3209 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) | |||
13 | case TRACE_WAKE: | 13 | case TRACE_WAKE: |
14 | case TRACE_STACK: | 14 | case TRACE_STACK: |
15 | case TRACE_PRINT: | 15 | case TRACE_PRINT: |
16 | case TRACE_SPECIAL: | ||
17 | case TRACE_BRANCH: | 16 | case TRACE_BRANCH: |
18 | case TRACE_GRAPH_ENT: | 17 | case TRACE_GRAPH_ENT: |
19 | case TRACE_GRAPH_RET: | 18 | case TRACE_GRAPH_RET: |
20 | case TRACE_KSYM: | ||
21 | return 1; | 19 | return 1; |
22 | } | 20 | } |
23 | return 0; | 21 | return 0; |
@@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr | |||
691 | } | 689 | } |
692 | #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ | 690 | #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ |
693 | 691 | ||
694 | #ifdef CONFIG_SYSPROF_TRACER | ||
695 | int | ||
696 | trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) | ||
697 | { | ||
698 | unsigned long count; | ||
699 | int ret; | ||
700 | |||
701 | /* start the tracing */ | ||
702 | ret = tracer_init(trace, tr); | ||
703 | if (ret) { | ||
704 | warn_failed_init_tracer(trace, ret); | ||
705 | return ret; | ||
706 | } | ||
707 | |||
708 | /* Sleep for a 1/10 of a second */ | ||
709 | msleep(100); | ||
710 | /* stop the tracing. */ | ||
711 | tracing_stop(); | ||
712 | /* check the trace buffer */ | ||
713 | ret = trace_test_buffer(tr, &count); | ||
714 | trace->reset(tr); | ||
715 | tracing_start(); | ||
716 | |||
717 | if (!ret && !count) { | ||
718 | printk(KERN_CONT ".. no entries found .."); | ||
719 | ret = -1; | ||
720 | } | ||
721 | |||
722 | return ret; | ||
723 | } | ||
724 | #endif /* CONFIG_SYSPROF_TRACER */ | ||
725 | |||
726 | #ifdef CONFIG_BRANCH_TRACER | 692 | #ifdef CONFIG_BRANCH_TRACER |
727 | int | 693 | int |
728 | trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | 694 | trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) |
@@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | |||
755 | } | 721 | } |
756 | #endif /* CONFIG_BRANCH_TRACER */ | 722 | #endif /* CONFIG_BRANCH_TRACER */ |
757 | 723 | ||
758 | #ifdef CONFIG_KSYM_TRACER | ||
759 | static int ksym_selftest_dummy; | ||
760 | |||
761 | int | ||
762 | trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) | ||
763 | { | ||
764 | unsigned long count; | ||
765 | int ret; | ||
766 | |||
767 | /* start the tracing */ | ||
768 | ret = tracer_init(trace, tr); | ||
769 | if (ret) { | ||
770 | warn_failed_init_tracer(trace, ret); | ||
771 | return ret; | ||
772 | } | ||
773 | |||
774 | ksym_selftest_dummy = 0; | ||
775 | /* Register the read-write tracing request */ | ||
776 | |||
777 | ret = process_new_ksym_entry("ksym_selftest_dummy", | ||
778 | HW_BREAKPOINT_R | HW_BREAKPOINT_W, | ||
779 | (unsigned long)(&ksym_selftest_dummy)); | ||
780 | |||
781 | if (ret < 0) { | ||
782 | printk(KERN_CONT "ksym_trace read-write startup test failed\n"); | ||
783 | goto ret_path; | ||
784 | } | ||
785 | /* Perform a read and a write operation over the dummy variable to | ||
786 | * trigger the tracer | ||
787 | */ | ||
788 | if (ksym_selftest_dummy == 0) | ||
789 | ksym_selftest_dummy++; | ||
790 | |||
791 | /* stop the tracing. */ | ||
792 | tracing_stop(); | ||
793 | /* check the trace buffer */ | ||
794 | ret = trace_test_buffer(tr, &count); | ||
795 | trace->reset(tr); | ||
796 | tracing_start(); | ||
797 | |||
798 | /* read & write operations - one each is performed on the dummy variable | ||
799 | * triggering two entries in the trace buffer | ||
800 | */ | ||
801 | if (!ret && count != 2) { | ||
802 | printk(KERN_CONT "Ksym tracer startup test failed"); | ||
803 | ret = -1; | ||
804 | } | ||
805 | |||
806 | ret_path: | ||
807 | return ret; | ||
808 | } | ||
809 | #endif /* CONFIG_KSYM_TRACER */ | ||
810 | |||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5f..a6b7e0e0f3eb 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -110,12 +110,12 @@ static inline void check_stack(void) | |||
110 | static void | 110 | static void |
111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) | 111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) |
112 | { | 112 | { |
113 | int cpu, resched; | 113 | int cpu; |
114 | 114 | ||
115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) | 115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) |
116 | return; | 116 | return; |
117 | 117 | ||
118 | resched = ftrace_preempt_disable(); | 118 | preempt_disable_notrace(); |
119 | 119 | ||
120 | cpu = raw_smp_processor_id(); | 120 | cpu = raw_smp_processor_id(); |
121 | /* no atomic needed, we only modify this variable by this cpu */ | 121 | /* no atomic needed, we only modify this variable by this cpu */ |
@@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
127 | out: | 127 | out: |
128 | per_cpu(trace_active, cpu)--; | 128 | per_cpu(trace_active, cpu)--; |
129 | /* prevent recursion in schedule */ | 129 | /* prevent recursion in schedule */ |
130 | ftrace_preempt_enable(resched); | 130 | preempt_enable_notrace(); |
131 | } | 131 | } |
132 | 132 | ||
133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
@@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i) | |||
249 | { | 249 | { |
250 | unsigned long addr = stack_dump_trace[i]; | 250 | unsigned long addr = stack_dump_trace[i]; |
251 | 251 | ||
252 | return seq_printf(m, "%pF\n", (void *)addr); | 252 | return seq_printf(m, "%pS\n", (void *)addr); |
253 | } | 253 | } |
254 | 254 | ||
255 | static void print_disabled(struct seq_file *m) | 255 | static void print_disabled(struct seq_file *m) |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304b..bac752f0cfb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
25 | 25 | ||
26 | /* All syscall exit events have the same fields */ | ||
27 | static LIST_HEAD(syscall_exit_fields); | ||
28 | |||
26 | static struct list_head * | 29 | static struct list_head * |
27 | syscall_get_enter_fields(struct ftrace_event_call *call) | 30 | syscall_get_enter_fields(struct ftrace_event_call *call) |
28 | { | 31 | { |
@@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
34 | static struct list_head * | 37 | static struct list_head * |
35 | syscall_get_exit_fields(struct ftrace_event_call *call) | 38 | syscall_get_exit_fields(struct ftrace_event_call *call) |
36 | { | 39 | { |
37 | struct syscall_metadata *entry = call->data; | 40 | return &syscall_exit_fields; |
38 | |||
39 | return &entry->exit_fields; | ||
40 | } | 41 | } |
41 | 42 | ||
42 | struct trace_event_functions enter_syscall_print_funcs = { | 43 | struct trace_event_functions enter_syscall_print_funcs = { |
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a552ca9..000000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null | |||
@@ -1,329 +0,0 @@ | |||
1 | /* | ||
2 | * trace stack traces | ||
3 | * | ||
4 | * Copyright (C) 2004-2008, Soeren Sandmann | ||
5 | * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> | ||
6 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> | ||
7 | */ | ||
8 | #include <linux/kallsyms.h> | ||
9 | #include <linux/debugfs.h> | ||
10 | #include <linux/hrtimer.h> | ||
11 | #include <linux/uaccess.h> | ||
12 | #include <linux/ftrace.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/irq.h> | ||
15 | #include <linux/fs.h> | ||
16 | |||
17 | #include <asm/stacktrace.h> | ||
18 | |||
19 | #include "trace.h" | ||
20 | |||
21 | static struct trace_array *sysprof_trace; | ||
22 | static int __read_mostly tracer_enabled; | ||
23 | |||
24 | /* | ||
25 | * 1 msec sample interval by default: | ||
26 | */ | ||
27 | static unsigned long sample_period = 1000000; | ||
28 | static const unsigned int sample_max_depth = 512; | ||
29 | |||
30 | static DEFINE_MUTEX(sample_timer_lock); | ||
31 | /* | ||
32 | * Per CPU hrtimers that do the profiling: | ||
33 | */ | ||
34 | static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); | ||
35 | |||
36 | struct stack_frame { | ||
37 | const void __user *next_fp; | ||
38 | unsigned long return_address; | ||
39 | }; | ||
40 | |||
41 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | ||
42 | { | ||
43 | int ret; | ||
44 | |||
45 | if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) | ||
46 | return 0; | ||
47 | |||
48 | ret = 1; | ||
49 | pagefault_disable(); | ||
50 | if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) | ||
51 | ret = 0; | ||
52 | pagefault_enable(); | ||
53 | |||
54 | return ret; | ||
55 | } | ||
56 | |||
57 | struct backtrace_info { | ||
58 | struct trace_array_cpu *data; | ||
59 | struct trace_array *tr; | ||
60 | int pos; | ||
61 | }; | ||
62 | |||
63 | static void | ||
64 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
65 | { | ||
66 | /* Ignore warnings */ | ||
67 | } | ||
68 | |||
69 | static void backtrace_warning(void *data, char *msg) | ||
70 | { | ||
71 | /* Ignore warnings */ | ||
72 | } | ||
73 | |||
74 | static int backtrace_stack(void *data, char *name) | ||
75 | { | ||
76 | /* Don't bother with IRQ stacks for now */ | ||
77 | return -1; | ||
78 | } | ||
79 | |||
80 | static void backtrace_address(void *data, unsigned long addr, int reliable) | ||
81 | { | ||
82 | struct backtrace_info *info = data; | ||
83 | |||
84 | if (info->pos < sample_max_depth && reliable) { | ||
85 | __trace_special(info->tr, info->data, 1, addr, 0); | ||
86 | |||
87 | info->pos++; | ||
88 | } | ||
89 | } | ||
90 | |||
91 | static const struct stacktrace_ops backtrace_ops = { | ||
92 | .warning = backtrace_warning, | ||
93 | .warning_symbol = backtrace_warning_symbol, | ||
94 | .stack = backtrace_stack, | ||
95 | .address = backtrace_address, | ||
96 | .walk_stack = print_context_stack, | ||
97 | }; | ||
98 | |||
99 | static int | ||
100 | trace_kernel(struct pt_regs *regs, struct trace_array *tr, | ||
101 | struct trace_array_cpu *data) | ||
102 | { | ||
103 | struct backtrace_info info; | ||
104 | unsigned long bp; | ||
105 | char *stack; | ||
106 | |||
107 | info.tr = tr; | ||
108 | info.data = data; | ||
109 | info.pos = 1; | ||
110 | |||
111 | __trace_special(info.tr, info.data, 1, regs->ip, 0); | ||
112 | |||
113 | stack = ((char *)regs + sizeof(struct pt_regs)); | ||
114 | #ifdef CONFIG_FRAME_POINTER | ||
115 | bp = regs->bp; | ||
116 | #else | ||
117 | bp = 0; | ||
118 | #endif | ||
119 | |||
120 | dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); | ||
121 | |||
122 | return info.pos; | ||
123 | } | ||
124 | |||
125 | static void timer_notify(struct pt_regs *regs, int cpu) | ||
126 | { | ||
127 | struct trace_array_cpu *data; | ||
128 | struct stack_frame frame; | ||
129 | struct trace_array *tr; | ||
130 | const void __user *fp; | ||
131 | int is_user; | ||
132 | int i; | ||
133 | |||
134 | if (!regs) | ||
135 | return; | ||
136 | |||
137 | tr = sysprof_trace; | ||
138 | data = tr->data[cpu]; | ||
139 | is_user = user_mode(regs); | ||
140 | |||
141 | if (!current || current->pid == 0) | ||
142 | return; | ||
143 | |||
144 | if (is_user && current->state != TASK_RUNNING) | ||
145 | return; | ||
146 | |||
147 | __trace_special(tr, data, 0, 0, current->pid); | ||
148 | |||
149 | if (!is_user) | ||
150 | i = trace_kernel(regs, tr, data); | ||
151 | else | ||
152 | i = 0; | ||
153 | |||
154 | /* | ||
155 | * Trace user stack if we are not a kernel thread | ||
156 | */ | ||
157 | if (current->mm && i < sample_max_depth) { | ||
158 | regs = (struct pt_regs *)current->thread.sp0 - 1; | ||
159 | |||
160 | fp = (void __user *)regs->bp; | ||
161 | |||
162 | __trace_special(tr, data, 2, regs->ip, 0); | ||
163 | |||
164 | while (i < sample_max_depth) { | ||
165 | frame.next_fp = NULL; | ||
166 | frame.return_address = 0; | ||
167 | if (!copy_stack_frame(fp, &frame)) | ||
168 | break; | ||
169 | if ((unsigned long)fp < regs->sp) | ||
170 | break; | ||
171 | |||
172 | __trace_special(tr, data, 2, frame.return_address, | ||
173 | (unsigned long)fp); | ||
174 | fp = frame.next_fp; | ||
175 | |||
176 | i++; | ||
177 | } | ||
178 | |||
179 | } | ||
180 | |||
181 | /* | ||
182 | * Special trace entry if we overflow the max depth: | ||
183 | */ | ||
184 | if (i == sample_max_depth) | ||
185 | __trace_special(tr, data, -1, -1, -1); | ||
186 | |||
187 | __trace_special(tr, data, 3, current->pid, i); | ||
188 | } | ||
189 | |||
190 | static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) | ||
191 | { | ||
192 | /* trace here */ | ||
193 | timer_notify(get_irq_regs(), smp_processor_id()); | ||
194 | |||
195 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); | ||
196 | |||
197 | return HRTIMER_RESTART; | ||
198 | } | ||
199 | |||
200 | static void start_stack_timer(void *unused) | ||
201 | { | ||
202 | struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); | ||
203 | |||
204 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
205 | hrtimer->function = stack_trace_timer_fn; | ||
206 | |||
207 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | ||
208 | HRTIMER_MODE_REL_PINNED); | ||
209 | } | ||
210 | |||
211 | static void start_stack_timers(void) | ||
212 | { | ||
213 | on_each_cpu(start_stack_timer, NULL, 1); | ||
214 | } | ||
215 | |||
216 | static void stop_stack_timer(int cpu) | ||
217 | { | ||
218 | struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); | ||
219 | |||
220 | hrtimer_cancel(hrtimer); | ||
221 | } | ||
222 | |||
223 | static void stop_stack_timers(void) | ||
224 | { | ||
225 | int cpu; | ||
226 | |||
227 | for_each_online_cpu(cpu) | ||
228 | stop_stack_timer(cpu); | ||
229 | } | ||
230 | |||
231 | static void stop_stack_trace(struct trace_array *tr) | ||
232 | { | ||
233 | mutex_lock(&sample_timer_lock); | ||
234 | stop_stack_timers(); | ||
235 | tracer_enabled = 0; | ||
236 | mutex_unlock(&sample_timer_lock); | ||
237 | } | ||
238 | |||
239 | static int stack_trace_init(struct trace_array *tr) | ||
240 | { | ||
241 | sysprof_trace = tr; | ||
242 | |||
243 | tracing_start_cmdline_record(); | ||
244 | |||
245 | mutex_lock(&sample_timer_lock); | ||
246 | start_stack_timers(); | ||
247 | tracer_enabled = 1; | ||
248 | mutex_unlock(&sample_timer_lock); | ||
249 | return 0; | ||
250 | } | ||
251 | |||
252 | static void stack_trace_reset(struct trace_array *tr) | ||
253 | { | ||
254 | tracing_stop_cmdline_record(); | ||
255 | stop_stack_trace(tr); | ||
256 | } | ||
257 | |||
258 | static struct tracer stack_trace __read_mostly = | ||
259 | { | ||
260 | .name = "sysprof", | ||
261 | .init = stack_trace_init, | ||
262 | .reset = stack_trace_reset, | ||
263 | #ifdef CONFIG_FTRACE_SELFTEST | ||
264 | .selftest = trace_selftest_startup_sysprof, | ||
265 | #endif | ||
266 | }; | ||
267 | |||
268 | __init static int init_stack_trace(void) | ||
269 | { | ||
270 | return register_tracer(&stack_trace); | ||
271 | } | ||
272 | device_initcall(init_stack_trace); | ||
273 | |||
274 | #define MAX_LONG_DIGITS 22 | ||
275 | |||
276 | static ssize_t | ||
277 | sysprof_sample_read(struct file *filp, char __user *ubuf, | ||
278 | size_t cnt, loff_t *ppos) | ||
279 | { | ||
280 | char buf[MAX_LONG_DIGITS]; | ||
281 | int r; | ||
282 | |||
283 | r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); | ||
284 | |||
285 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
286 | } | ||
287 | |||
288 | static ssize_t | ||
289 | sysprof_sample_write(struct file *filp, const char __user *ubuf, | ||
290 | size_t cnt, loff_t *ppos) | ||
291 | { | ||
292 | char buf[MAX_LONG_DIGITS]; | ||
293 | unsigned long val; | ||
294 | |||
295 | if (cnt > MAX_LONG_DIGITS-1) | ||
296 | cnt = MAX_LONG_DIGITS-1; | ||
297 | |||
298 | if (copy_from_user(&buf, ubuf, cnt)) | ||
299 | return -EFAULT; | ||
300 | |||
301 | buf[cnt] = 0; | ||
302 | |||
303 | val = simple_strtoul(buf, NULL, 10); | ||
304 | /* | ||
305 | * Enforce a minimum sample period of 100 usecs: | ||
306 | */ | ||
307 | if (val < 100) | ||
308 | val = 100; | ||
309 | |||
310 | mutex_lock(&sample_timer_lock); | ||
311 | stop_stack_timers(); | ||
312 | sample_period = val * 1000; | ||
313 | start_stack_timers(); | ||
314 | mutex_unlock(&sample_timer_lock); | ||
315 | |||
316 | return cnt; | ||
317 | } | ||
318 | |||
319 | static const struct file_operations sysprof_sample_fops = { | ||
320 | .read = sysprof_sample_read, | ||
321 | .write = sysprof_sample_write, | ||
322 | }; | ||
323 | |||
324 | void init_tracer_sysprof_debugfs(struct dentry *d_tracer) | ||
325 | { | ||
326 | |||
327 | trace_create_file("sysprof_sample_period", 0644, | ||
328 | d_tracer, NULL, &sysprof_sample_fops); | ||
329 | } | ||
diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 000000000000..7f9c3c52ecc1 --- /dev/null +++ b/kernel/watchdog.c | |||
@@ -0,0 +1,577 @@ | |||
1 | /* | ||
2 | * Detect hard and soft lockups on a system | ||
3 | * | ||
4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * this code detects hard lockups: incidents in where on a CPU | ||
7 | * the kernel does not respond to anything except NMI. | ||
8 | * | ||
9 | * Note: Most of this code is borrowed heavily from softlockup.c, | ||
10 | * so thanks to Ingo for the initial implementation. | ||
11 | * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks | ||
12 | * to those contributors as well. | ||
13 | */ | ||
14 | |||
15 | #include <linux/mm.h> | ||
16 | #include <linux/cpu.h> | ||
17 | #include <linux/nmi.h> | ||
18 | #include <linux/init.h> | ||
19 | #include <linux/delay.h> | ||
20 | #include <linux/freezer.h> | ||
21 | #include <linux/kthread.h> | ||
22 | #include <linux/lockdep.h> | ||
23 | #include <linux/notifier.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/sysctl.h> | ||
26 | |||
27 | #include <asm/irq_regs.h> | ||
28 | #include <linux/perf_event.h> | ||
29 | |||
30 | int watchdog_enabled; | ||
31 | int __read_mostly softlockup_thresh = 60; | ||
32 | |||
33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | ||
34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
35 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); | ||
36 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); | ||
37 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | ||
38 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
39 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
40 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
41 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
42 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | ||
43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
44 | #endif | ||
45 | |||
46 | static int __read_mostly did_panic; | ||
47 | static int __initdata no_watchdog; | ||
48 | |||
49 | |||
50 | /* boot commands */ | ||
51 | /* | ||
52 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
53 | */ | ||
54 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
55 | static int hardlockup_panic; | ||
56 | |||
57 | static int __init hardlockup_panic_setup(char *str) | ||
58 | { | ||
59 | if (!strncmp(str, "panic", 5)) | ||
60 | hardlockup_panic = 1; | ||
61 | return 1; | ||
62 | } | ||
63 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
64 | #endif | ||
65 | |||
66 | unsigned int __read_mostly softlockup_panic = | ||
67 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
68 | |||
69 | static int __init softlockup_panic_setup(char *str) | ||
70 | { | ||
71 | softlockup_panic = simple_strtoul(str, NULL, 0); | ||
72 | |||
73 | return 1; | ||
74 | } | ||
75 | __setup("softlockup_panic=", softlockup_panic_setup); | ||
76 | |||
77 | static int __init nowatchdog_setup(char *str) | ||
78 | { | ||
79 | no_watchdog = 1; | ||
80 | return 1; | ||
81 | } | ||
82 | __setup("nowatchdog", nowatchdog_setup); | ||
83 | |||
84 | /* deprecated */ | ||
85 | static int __init nosoftlockup_setup(char *str) | ||
86 | { | ||
87 | no_watchdog = 1; | ||
88 | return 1; | ||
89 | } | ||
90 | __setup("nosoftlockup", nosoftlockup_setup); | ||
91 | /* */ | ||
92 | |||
93 | |||
94 | /* | ||
95 | * Returns seconds, approximately. We don't need nanosecond | ||
96 | * resolution, and we don't need to waste time with a big divide when | ||
97 | * 2^30ns == 1.074s. | ||
98 | */ | ||
99 | static unsigned long get_timestamp(int this_cpu) | ||
100 | { | ||
101 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | ||
102 | } | ||
103 | |||
104 | static unsigned long get_sample_period(void) | ||
105 | { | ||
106 | /* | ||
107 | * convert softlockup_thresh from seconds to ns | ||
108 | * the divide by 5 is to give hrtimer 5 chances to | ||
109 | * increment before the hardlockup detector generates | ||
110 | * a warning | ||
111 | */ | ||
112 | return softlockup_thresh / 5 * NSEC_PER_SEC; | ||
113 | } | ||
114 | |||
115 | /* Commands for resetting the watchdog */ | ||
116 | static void __touch_watchdog(void) | ||
117 | { | ||
118 | int this_cpu = smp_processor_id(); | ||
119 | |||
120 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | ||
121 | } | ||
122 | |||
123 | void touch_softlockup_watchdog(void) | ||
124 | { | ||
125 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | ||
126 | } | ||
127 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
128 | |||
129 | void touch_all_softlockup_watchdogs(void) | ||
130 | { | ||
131 | int cpu; | ||
132 | |||
133 | /* | ||
134 | * this is done lockless | ||
135 | * do we care if a 0 races with a timestamp? | ||
136 | * all it means is the softlock check starts one cycle later | ||
137 | */ | ||
138 | for_each_online_cpu(cpu) | ||
139 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
140 | } | ||
141 | |||
142 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
143 | void touch_nmi_watchdog(void) | ||
144 | { | ||
145 | if (watchdog_enabled) { | ||
146 | unsigned cpu; | ||
147 | |||
148 | for_each_present_cpu(cpu) { | ||
149 | if (per_cpu(watchdog_nmi_touch, cpu) != true) | ||
150 | per_cpu(watchdog_nmi_touch, cpu) = true; | ||
151 | } | ||
152 | } | ||
153 | touch_softlockup_watchdog(); | ||
154 | } | ||
155 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
156 | |||
157 | #endif | ||
158 | |||
159 | void touch_softlockup_watchdog_sync(void) | ||
160 | { | ||
161 | __raw_get_cpu_var(softlockup_touch_sync) = true; | ||
162 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | ||
163 | } | ||
164 | |||
165 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
166 | /* watchdog detector functions */ | ||
167 | static int is_hardlockup(void) | ||
168 | { | ||
169 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | ||
170 | |||
171 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | ||
172 | return 1; | ||
173 | |||
174 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | ||
175 | return 0; | ||
176 | } | ||
177 | #endif | ||
178 | |||
179 | static int is_softlockup(unsigned long touch_ts) | ||
180 | { | ||
181 | unsigned long now = get_timestamp(smp_processor_id()); | ||
182 | |||
183 | /* Warn about unreasonable delays: */ | ||
184 | if (time_after(now, touch_ts + softlockup_thresh)) | ||
185 | return now - touch_ts; | ||
186 | |||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static int | ||
191 | watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
192 | { | ||
193 | did_panic = 1; | ||
194 | |||
195 | return NOTIFY_DONE; | ||
196 | } | ||
197 | |||
198 | static struct notifier_block panic_block = { | ||
199 | .notifier_call = watchdog_panic, | ||
200 | }; | ||
201 | |||
202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
203 | static struct perf_event_attr wd_hw_attr = { | ||
204 | .type = PERF_TYPE_HARDWARE, | ||
205 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
206 | .size = sizeof(struct perf_event_attr), | ||
207 | .pinned = 1, | ||
208 | .disabled = 1, | ||
209 | }; | ||
210 | |||
211 | /* Callback function for perf event subsystem */ | ||
212 | void watchdog_overflow_callback(struct perf_event *event, int nmi, | ||
213 | struct perf_sample_data *data, | ||
214 | struct pt_regs *regs) | ||
215 | { | ||
216 | /* Ensure the watchdog never gets throttled */ | ||
217 | event->hw.interrupts = 0; | ||
218 | |||
219 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | ||
220 | __get_cpu_var(watchdog_nmi_touch) = false; | ||
221 | return; | ||
222 | } | ||
223 | |||
224 | /* check for a hardlockup | ||
225 | * This is done by making sure our timer interrupt | ||
226 | * is incrementing. The timer interrupt should have | ||
227 | * fired multiple times before we overflow'd. If it hasn't | ||
228 | * then this is a good indication the cpu is stuck | ||
229 | */ | ||
230 | if (is_hardlockup()) { | ||
231 | int this_cpu = smp_processor_id(); | ||
232 | |||
233 | /* only print hardlockups once */ | ||
234 | if (__get_cpu_var(hard_watchdog_warn) == true) | ||
235 | return; | ||
236 | |||
237 | if (hardlockup_panic) | ||
238 | panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
239 | else | ||
240 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
241 | |||
242 | __get_cpu_var(hard_watchdog_warn) = true; | ||
243 | return; | ||
244 | } | ||
245 | |||
246 | __get_cpu_var(hard_watchdog_warn) = false; | ||
247 | return; | ||
248 | } | ||
249 | static void watchdog_interrupt_count(void) | ||
250 | { | ||
251 | __get_cpu_var(hrtimer_interrupts)++; | ||
252 | } | ||
253 | #else | ||
254 | static inline void watchdog_interrupt_count(void) { return; } | ||
255 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
256 | |||
257 | /* watchdog kicker functions */ | ||
258 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | ||
259 | { | ||
260 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | ||
261 | struct pt_regs *regs = get_irq_regs(); | ||
262 | int duration; | ||
263 | |||
264 | /* kick the hardlockup detector */ | ||
265 | watchdog_interrupt_count(); | ||
266 | |||
267 | /* kick the softlockup detector */ | ||
268 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | ||
269 | |||
270 | /* .. and repeat */ | ||
271 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | ||
272 | |||
273 | if (touch_ts == 0) { | ||
274 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | ||
275 | /* | ||
276 | * If the time stamp was touched atomically | ||
277 | * make sure the scheduler tick is up to date. | ||
278 | */ | ||
279 | __get_cpu_var(softlockup_touch_sync) = false; | ||
280 | sched_clock_tick(); | ||
281 | } | ||
282 | __touch_watchdog(); | ||
283 | return HRTIMER_RESTART; | ||
284 | } | ||
285 | |||
286 | /* check for a softlockup | ||
287 | * This is done by making sure a high priority task is | ||
288 | * being scheduled. The task touches the watchdog to | ||
289 | * indicate it is getting cpu time. If it hasn't then | ||
290 | * this is a good indication some task is hogging the cpu | ||
291 | */ | ||
292 | duration = is_softlockup(touch_ts); | ||
293 | if (unlikely(duration)) { | ||
294 | /* only warn once */ | ||
295 | if (__get_cpu_var(soft_watchdog_warn) == true) | ||
296 | return HRTIMER_RESTART; | ||
297 | |||
298 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | ||
299 | smp_processor_id(), duration, | ||
300 | current->comm, task_pid_nr(current)); | ||
301 | print_modules(); | ||
302 | print_irqtrace_events(current); | ||
303 | if (regs) | ||
304 | show_regs(regs); | ||
305 | else | ||
306 | dump_stack(); | ||
307 | |||
308 | if (softlockup_panic) | ||
309 | panic("softlockup: hung tasks"); | ||
310 | __get_cpu_var(soft_watchdog_warn) = true; | ||
311 | } else | ||
312 | __get_cpu_var(soft_watchdog_warn) = false; | ||
313 | |||
314 | return HRTIMER_RESTART; | ||
315 | } | ||
316 | |||
317 | |||
318 | /* | ||
319 | * The watchdog thread - touches the timestamp. | ||
320 | */ | ||
321 | static int watchdog(void *unused) | ||
322 | { | ||
323 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
324 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
325 | |||
326 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
327 | |||
328 | /* initialize timestamp */ | ||
329 | __touch_watchdog(); | ||
330 | |||
331 | /* kick off the timer for the hardlockup detector */ | ||
332 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | ||
333 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | ||
334 | HRTIMER_MODE_REL_PINNED); | ||
335 | |||
336 | set_current_state(TASK_INTERRUPTIBLE); | ||
337 | /* | ||
338 | * Run briefly once per second to reset the softlockup timestamp. | ||
339 | * If this gets delayed for more than 60 seconds then the | ||
340 | * debug-printout triggers in watchdog_timer_fn(). | ||
341 | */ | ||
342 | while (!kthread_should_stop()) { | ||
343 | __touch_watchdog(); | ||
344 | schedule(); | ||
345 | |||
346 | if (kthread_should_stop()) | ||
347 | break; | ||
348 | |||
349 | set_current_state(TASK_INTERRUPTIBLE); | ||
350 | } | ||
351 | __set_current_state(TASK_RUNNING); | ||
352 | |||
353 | return 0; | ||
354 | } | ||
355 | |||
356 | |||
357 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
358 | static int watchdog_nmi_enable(int cpu) | ||
359 | { | ||
360 | struct perf_event_attr *wd_attr; | ||
361 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
362 | |||
363 | /* is it already setup and enabled? */ | ||
364 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
365 | goto out; | ||
366 | |||
367 | /* it is setup but not enabled */ | ||
368 | if (event != NULL) | ||
369 | goto out_enable; | ||
370 | |||
371 | /* Try to register using hardware perf events */ | ||
372 | wd_attr = &wd_hw_attr; | ||
373 | wd_attr->sample_period = hw_nmi_get_sample_period(); | ||
374 | event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); | ||
375 | if (!IS_ERR(event)) { | ||
376 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | ||
377 | goto out_save; | ||
378 | } | ||
379 | |||
380 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | ||
381 | return -1; | ||
382 | |||
383 | /* success path */ | ||
384 | out_save: | ||
385 | per_cpu(watchdog_ev, cpu) = event; | ||
386 | out_enable: | ||
387 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
388 | out: | ||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | static void watchdog_nmi_disable(int cpu) | ||
393 | { | ||
394 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
395 | |||
396 | if (event) { | ||
397 | perf_event_disable(event); | ||
398 | per_cpu(watchdog_ev, cpu) = NULL; | ||
399 | |||
400 | /* should be in cleanup, but blocks oprofile */ | ||
401 | perf_event_release_kernel(event); | ||
402 | } | ||
403 | return; | ||
404 | } | ||
405 | #else | ||
406 | static int watchdog_nmi_enable(int cpu) { return 0; } | ||
407 | static void watchdog_nmi_disable(int cpu) { return; } | ||
408 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
409 | |||
410 | /* prepare/enable/disable routines */ | ||
411 | static int watchdog_prepare_cpu(int cpu) | ||
412 | { | ||
413 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
414 | |||
415 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | ||
416 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
417 | hrtimer->function = watchdog_timer_fn; | ||
418 | |||
419 | return 0; | ||
420 | } | ||
421 | |||
422 | static int watchdog_enable(int cpu) | ||
423 | { | ||
424 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
425 | |||
426 | /* enable the perf event */ | ||
427 | if (watchdog_nmi_enable(cpu) != 0) | ||
428 | return -1; | ||
429 | |||
430 | /* create the watchdog thread */ | ||
431 | if (!p) { | ||
432 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | ||
433 | if (IS_ERR(p)) { | ||
434 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | ||
435 | return -1; | ||
436 | } | ||
437 | kthread_bind(p, cpu); | ||
438 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
439 | per_cpu(softlockup_watchdog, cpu) = p; | ||
440 | wake_up_process(p); | ||
441 | } | ||
442 | |||
443 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
444 | watchdog_enabled = 1; | ||
445 | |||
446 | return 0; | ||
447 | } | ||
448 | |||
449 | static void watchdog_disable(int cpu) | ||
450 | { | ||
451 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
452 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
453 | |||
454 | /* | ||
455 | * cancel the timer first to stop incrementing the stats | ||
456 | * and waking up the kthread | ||
457 | */ | ||
458 | hrtimer_cancel(hrtimer); | ||
459 | |||
460 | /* disable the perf event */ | ||
461 | watchdog_nmi_disable(cpu); | ||
462 | |||
463 | /* stop the watchdog thread */ | ||
464 | if (p) { | ||
465 | per_cpu(softlockup_watchdog, cpu) = NULL; | ||
466 | kthread_stop(p); | ||
467 | } | ||
468 | } | ||
469 | |||
470 | static void watchdog_enable_all_cpus(void) | ||
471 | { | ||
472 | int cpu; | ||
473 | int result = 0; | ||
474 | |||
475 | for_each_online_cpu(cpu) | ||
476 | result += watchdog_enable(cpu); | ||
477 | |||
478 | if (result) | ||
479 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | ||
480 | |||
481 | } | ||
482 | |||
483 | static void watchdog_disable_all_cpus(void) | ||
484 | { | ||
485 | int cpu; | ||
486 | |||
487 | for_each_online_cpu(cpu) | ||
488 | watchdog_disable(cpu); | ||
489 | |||
490 | /* if all watchdogs are disabled, then they are disabled for the system */ | ||
491 | watchdog_enabled = 0; | ||
492 | } | ||
493 | |||
494 | |||
495 | /* sysctl functions */ | ||
496 | #ifdef CONFIG_SYSCTL | ||
497 | /* | ||
498 | * proc handler for /proc/sys/kernel/nmi_watchdog | ||
499 | */ | ||
500 | |||
501 | int proc_dowatchdog_enabled(struct ctl_table *table, int write, | ||
502 | void __user *buffer, size_t *length, loff_t *ppos) | ||
503 | { | ||
504 | proc_dointvec(table, write, buffer, length, ppos); | ||
505 | |||
506 | if (watchdog_enabled) | ||
507 | watchdog_enable_all_cpus(); | ||
508 | else | ||
509 | watchdog_disable_all_cpus(); | ||
510 | return 0; | ||
511 | } | ||
512 | |||
513 | int proc_dowatchdog_thresh(struct ctl_table *table, int write, | ||
514 | void __user *buffer, | ||
515 | size_t *lenp, loff_t *ppos) | ||
516 | { | ||
517 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
518 | } | ||
519 | #endif /* CONFIG_SYSCTL */ | ||
520 | |||
521 | |||
522 | /* | ||
523 | * Create/destroy watchdog threads as CPUs come and go: | ||
524 | */ | ||
525 | static int __cpuinit | ||
526 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
527 | { | ||
528 | int hotcpu = (unsigned long)hcpu; | ||
529 | |||
530 | switch (action) { | ||
531 | case CPU_UP_PREPARE: | ||
532 | case CPU_UP_PREPARE_FROZEN: | ||
533 | if (watchdog_prepare_cpu(hotcpu)) | ||
534 | return NOTIFY_BAD; | ||
535 | break; | ||
536 | case CPU_ONLINE: | ||
537 | case CPU_ONLINE_FROZEN: | ||
538 | if (watchdog_enable(hotcpu)) | ||
539 | return NOTIFY_BAD; | ||
540 | break; | ||
541 | #ifdef CONFIG_HOTPLUG_CPU | ||
542 | case CPU_UP_CANCELED: | ||
543 | case CPU_UP_CANCELED_FROZEN: | ||
544 | watchdog_disable(hotcpu); | ||
545 | break; | ||
546 | case CPU_DEAD: | ||
547 | case CPU_DEAD_FROZEN: | ||
548 | watchdog_disable(hotcpu); | ||
549 | break; | ||
550 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
551 | } | ||
552 | return NOTIFY_OK; | ||
553 | } | ||
554 | |||
555 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
556 | .notifier_call = cpu_callback | ||
557 | }; | ||
558 | |||
559 | static int __init spawn_watchdog_task(void) | ||
560 | { | ||
561 | void *cpu = (void *)(long)smp_processor_id(); | ||
562 | int err; | ||
563 | |||
564 | if (no_watchdog) | ||
565 | return 0; | ||
566 | |||
567 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
568 | WARN_ON(err == NOTIFY_BAD); | ||
569 | |||
570 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
571 | register_cpu_notifier(&cpu_nfb); | ||
572 | |||
573 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
574 | |||
575 | return 0; | ||
576 | } | ||
577 | early_initcall(spawn_watchdog_task); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..f77afd939229 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1,19 +1,26 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/workqueue.c | 2 | * kernel/workqueue.c - generic async execution with shared worker pool |
3 | * | 3 | * |
4 | * Generic mechanism for defining kernel helper threads for running | 4 | * Copyright (C) 2002 Ingo Molnar |
5 | * arbitrary tasks in process context. | ||
6 | * | 5 | * |
7 | * Started by Ingo Molnar, Copyright (C) 2002 | 6 | * Derived from the taskqueue/keventd code by: |
7 | * David Woodhouse <dwmw2@infradead.org> | ||
8 | * Andrew Morton | ||
9 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
10 | * Theodore Ts'o <tytso@mit.edu> | ||
8 | * | 11 | * |
9 | * Derived from the taskqueue/keventd code by: | 12 | * Made to use alloc_percpu by Christoph Lameter. |
10 | * | 13 | * |
11 | * David Woodhouse <dwmw2@infradead.org> | 14 | * Copyright (C) 2010 SUSE Linux Products GmbH |
12 | * Andrew Morton | 15 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> |
13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | ||
14 | * Theodore Ts'o <tytso@mit.edu> | ||
15 | * | 16 | * |
16 | * Made to use alloc_percpu by Christoph Lameter. | 17 | * This is the generic async execution mechanism. Work items as are |
18 | * executed in process context. The worker pool is shared and | ||
19 | * automatically managed. There is one worker pool for each CPU and | ||
20 | * one extra for works which are better served by workers which are | ||
21 | * not bound to any specific CPU. | ||
22 | * | ||
23 | * Please read Documentation/workqueue.txt for details. | ||
17 | */ | 24 | */ |
18 | 25 | ||
19 | #include <linux/module.h> | 26 | #include <linux/module.h> |
@@ -33,41 +40,291 @@ | |||
33 | #include <linux/kallsyms.h> | 40 | #include <linux/kallsyms.h> |
34 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
35 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
43 | #include <linux/idr.h> | ||
44 | |||
36 | #define CREATE_TRACE_POINTS | 45 | #define CREATE_TRACE_POINTS |
37 | #include <trace/events/workqueue.h> | 46 | #include <trace/events/workqueue.h> |
38 | 47 | ||
48 | #include "workqueue_sched.h" | ||
49 | |||
50 | enum { | ||
51 | /* global_cwq flags */ | ||
52 | GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
53 | GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ | ||
54 | GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | ||
55 | GCWQ_FREEZING = 1 << 3, /* freeze in progress */ | ||
56 | GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ | ||
57 | |||
58 | /* worker flags */ | ||
59 | WORKER_STARTED = 1 << 0, /* started */ | ||
60 | WORKER_DIE = 1 << 1, /* die die die */ | ||
61 | WORKER_IDLE = 1 << 2, /* is idle */ | ||
62 | WORKER_PREP = 1 << 3, /* preparing to run works */ | ||
63 | WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ | ||
64 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | ||
65 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | ||
66 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | ||
67 | |||
68 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | | ||
69 | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, | ||
70 | |||
71 | /* gcwq->trustee_state */ | ||
72 | TRUSTEE_START = 0, /* start */ | ||
73 | TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ | ||
74 | TRUSTEE_BUTCHER = 2, /* butcher workers */ | ||
75 | TRUSTEE_RELEASE = 3, /* release workers */ | ||
76 | TRUSTEE_DONE = 4, /* trustee is done */ | ||
77 | |||
78 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | ||
79 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, | ||
80 | BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, | ||
81 | |||
82 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | ||
83 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | ||
84 | |||
85 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | ||
86 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | ||
87 | CREATE_COOLDOWN = HZ, /* time to breathe after fail */ | ||
88 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | ||
89 | |||
90 | /* | ||
91 | * Rescue workers are used only on emergencies and shared by | ||
92 | * all cpus. Give -20. | ||
93 | */ | ||
94 | RESCUER_NICE_LEVEL = -20, | ||
95 | }; | ||
96 | |||
97 | /* | ||
98 | * Structure fields follow one of the following exclusion rules. | ||
99 | * | ||
100 | * I: Modifiable by initialization/destruction paths and read-only for | ||
101 | * everyone else. | ||
102 | * | ||
103 | * P: Preemption protected. Disabling preemption is enough and should | ||
104 | * only be modified and accessed from the local cpu. | ||
105 | * | ||
106 | * L: gcwq->lock protected. Access with gcwq->lock held. | ||
107 | * | ||
108 | * X: During normal operation, modification requires gcwq->lock and | ||
109 | * should be done only from local cpu. Either disabling preemption | ||
110 | * on local cpu or grabbing gcwq->lock is enough for read access. | ||
111 | * If GCWQ_DISASSOCIATED is set, it's identical to L. | ||
112 | * | ||
113 | * F: wq->flush_mutex protected. | ||
114 | * | ||
115 | * W: workqueue_lock protected. | ||
116 | */ | ||
117 | |||
118 | struct global_cwq; | ||
119 | |||
39 | /* | 120 | /* |
40 | * The per-CPU workqueue (if single thread, we always use the first | 121 | * The poor guys doing the actual heavy lifting. All on-duty workers |
41 | * possible cpu). | 122 | * are either serving the manager role, on idle list or on busy hash. |
42 | */ | 123 | */ |
43 | struct cpu_workqueue_struct { | 124 | struct worker { |
125 | /* on idle list while idle, on busy hash table while busy */ | ||
126 | union { | ||
127 | struct list_head entry; /* L: while idle */ | ||
128 | struct hlist_node hentry; /* L: while busy */ | ||
129 | }; | ||
44 | 130 | ||
45 | spinlock_t lock; | 131 | struct work_struct *current_work; /* L: work being processed */ |
132 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ | ||
133 | struct list_head scheduled; /* L: scheduled works */ | ||
134 | struct task_struct *task; /* I: worker task */ | ||
135 | struct global_cwq *gcwq; /* I: the associated gcwq */ | ||
136 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | ||
137 | unsigned long last_active; /* L: last active timestamp */ | ||
138 | unsigned int flags; /* X: flags */ | ||
139 | int id; /* I: worker id */ | ||
140 | struct work_struct rebind_work; /* L: rebind worker to cpu */ | ||
141 | }; | ||
46 | 142 | ||
47 | struct list_head worklist; | 143 | /* |
48 | wait_queue_head_t more_work; | 144 | * Global per-cpu workqueue. There's one and only one for each cpu |
49 | struct work_struct *current_work; | 145 | * and all works are queued and processed here regardless of their |
146 | * target workqueues. | ||
147 | */ | ||
148 | struct global_cwq { | ||
149 | spinlock_t lock; /* the gcwq lock */ | ||
150 | struct list_head worklist; /* L: list of pending works */ | ||
151 | unsigned int cpu; /* I: the associated cpu */ | ||
152 | unsigned int flags; /* L: GCWQ_* flags */ | ||
50 | 153 | ||
51 | struct workqueue_struct *wq; | 154 | int nr_workers; /* L: total number of workers */ |
52 | struct task_struct *thread; | 155 | int nr_idle; /* L: currently idle ones */ |
53 | } ____cacheline_aligned; | 156 | |
157 | /* workers are chained either in the idle_list or busy_hash */ | ||
158 | struct list_head idle_list; /* X: list of idle workers */ | ||
159 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | ||
160 | /* L: hash of busy workers */ | ||
161 | |||
162 | struct timer_list idle_timer; /* L: worker idle timeout */ | ||
163 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | ||
164 | |||
165 | struct ida worker_ida; /* L: for worker IDs */ | ||
166 | |||
167 | struct task_struct *trustee; /* L: for gcwq shutdown */ | ||
168 | unsigned int trustee_state; /* L: trustee state */ | ||
169 | wait_queue_head_t trustee_wait; /* trustee wait */ | ||
170 | struct worker *first_idle; /* L: first idle worker */ | ||
171 | } ____cacheline_aligned_in_smp; | ||
172 | |||
173 | /* | ||
174 | * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of | ||
175 | * work_struct->data are used for flags and thus cwqs need to be | ||
176 | * aligned at two to the power of the number of flag bits. | ||
177 | */ | ||
178 | struct cpu_workqueue_struct { | ||
179 | struct global_cwq *gcwq; /* I: the associated gcwq */ | ||
180 | struct workqueue_struct *wq; /* I: the owning workqueue */ | ||
181 | int work_color; /* L: current color */ | ||
182 | int flush_color; /* L: flushing color */ | ||
183 | int nr_in_flight[WORK_NR_COLORS]; | ||
184 | /* L: nr of in_flight works */ | ||
185 | int nr_active; /* L: nr of active works */ | ||
186 | int max_active; /* L: max active works */ | ||
187 | struct list_head delayed_works; /* L: delayed works */ | ||
188 | }; | ||
189 | |||
190 | /* | ||
191 | * Structure used to wait for workqueue flush. | ||
192 | */ | ||
193 | struct wq_flusher { | ||
194 | struct list_head list; /* F: list of flushers */ | ||
195 | int flush_color; /* F: flush color waiting for */ | ||
196 | struct completion done; /* flush completion */ | ||
197 | }; | ||
198 | |||
199 | /* | ||
200 | * All cpumasks are assumed to be always set on UP and thus can't be | ||
201 | * used to determine whether there's something to be done. | ||
202 | */ | ||
203 | #ifdef CONFIG_SMP | ||
204 | typedef cpumask_var_t mayday_mask_t; | ||
205 | #define mayday_test_and_set_cpu(cpu, mask) \ | ||
206 | cpumask_test_and_set_cpu((cpu), (mask)) | ||
207 | #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) | ||
208 | #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) | ||
209 | #define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) | ||
210 | #define free_mayday_mask(mask) free_cpumask_var((mask)) | ||
211 | #else | ||
212 | typedef unsigned long mayday_mask_t; | ||
213 | #define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) | ||
214 | #define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) | ||
215 | #define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) | ||
216 | #define alloc_mayday_mask(maskp, gfp) true | ||
217 | #define free_mayday_mask(mask) do { } while (0) | ||
218 | #endif | ||
54 | 219 | ||
55 | /* | 220 | /* |
56 | * The externally visible workqueue abstraction is an array of | 221 | * The externally visible workqueue abstraction is an array of |
57 | * per-CPU workqueues: | 222 | * per-CPU workqueues: |
58 | */ | 223 | */ |
59 | struct workqueue_struct { | 224 | struct workqueue_struct { |
60 | struct cpu_workqueue_struct *cpu_wq; | 225 | unsigned int flags; /* I: WQ_* flags */ |
61 | struct list_head list; | 226 | union { |
62 | const char *name; | 227 | struct cpu_workqueue_struct __percpu *pcpu; |
63 | int singlethread; | 228 | struct cpu_workqueue_struct *single; |
64 | int freezeable; /* Freeze threads during suspend */ | 229 | unsigned long v; |
65 | int rt; | 230 | } cpu_wq; /* I: cwq's */ |
231 | struct list_head list; /* W: list of all workqueues */ | ||
232 | |||
233 | struct mutex flush_mutex; /* protects wq flushing */ | ||
234 | int work_color; /* F: current work color */ | ||
235 | int flush_color; /* F: current flush color */ | ||
236 | atomic_t nr_cwqs_to_flush; /* flush in progress */ | ||
237 | struct wq_flusher *first_flusher; /* F: first flusher */ | ||
238 | struct list_head flusher_queue; /* F: flush waiters */ | ||
239 | struct list_head flusher_overflow; /* F: flush overflow list */ | ||
240 | |||
241 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ | ||
242 | struct worker *rescuer; /* I: rescue worker */ | ||
243 | |||
244 | int saved_max_active; /* W: saved cwq max_active */ | ||
245 | const char *name; /* I: workqueue name */ | ||
66 | #ifdef CONFIG_LOCKDEP | 246 | #ifdef CONFIG_LOCKDEP |
67 | struct lockdep_map lockdep_map; | 247 | struct lockdep_map lockdep_map; |
68 | #endif | 248 | #endif |
69 | }; | 249 | }; |
70 | 250 | ||
251 | struct workqueue_struct *system_wq __read_mostly; | ||
252 | struct workqueue_struct *system_long_wq __read_mostly; | ||
253 | struct workqueue_struct *system_nrt_wq __read_mostly; | ||
254 | struct workqueue_struct *system_unbound_wq __read_mostly; | ||
255 | EXPORT_SYMBOL_GPL(system_wq); | ||
256 | EXPORT_SYMBOL_GPL(system_long_wq); | ||
257 | EXPORT_SYMBOL_GPL(system_nrt_wq); | ||
258 | EXPORT_SYMBOL_GPL(system_unbound_wq); | ||
259 | |||
260 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | ||
261 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | ||
262 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | ||
263 | |||
264 | static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, | ||
265 | unsigned int sw) | ||
266 | { | ||
267 | if (cpu < nr_cpu_ids) { | ||
268 | if (sw & 1) { | ||
269 | cpu = cpumask_next(cpu, mask); | ||
270 | if (cpu < nr_cpu_ids) | ||
271 | return cpu; | ||
272 | } | ||
273 | if (sw & 2) | ||
274 | return WORK_CPU_UNBOUND; | ||
275 | } | ||
276 | return WORK_CPU_NONE; | ||
277 | } | ||
278 | |||
279 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | ||
280 | struct workqueue_struct *wq) | ||
281 | { | ||
282 | return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * CPU iterators | ||
287 | * | ||
288 | * An extra gcwq is defined for an invalid cpu number | ||
289 | * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any | ||
290 | * specific CPU. The following iterators are similar to | ||
291 | * for_each_*_cpu() iterators but also consider the unbound gcwq. | ||
292 | * | ||
293 | * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND | ||
294 | * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND | ||
295 | * for_each_cwq_cpu() : possible CPUs for bound workqueues, | ||
296 | * WORK_CPU_UNBOUND for unbound workqueues | ||
297 | */ | ||
298 | #define for_each_gcwq_cpu(cpu) \ | ||
299 | for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ | ||
300 | (cpu) < WORK_CPU_NONE; \ | ||
301 | (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) | ||
302 | |||
303 | #define for_each_online_gcwq_cpu(cpu) \ | ||
304 | for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ | ||
305 | (cpu) < WORK_CPU_NONE; \ | ||
306 | (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) | ||
307 | |||
308 | #define for_each_cwq_cpu(cpu, wq) \ | ||
309 | for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ | ||
310 | (cpu) < WORK_CPU_NONE; \ | ||
311 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) | ||
312 | |||
313 | #ifdef CONFIG_LOCKDEP | ||
314 | /** | ||
315 | * in_workqueue_context() - in context of specified workqueue? | ||
316 | * @wq: the workqueue of interest | ||
317 | * | ||
318 | * Checks lockdep state to see if the current task is executing from | ||
319 | * within a workqueue item. This function exists only if lockdep is | ||
320 | * enabled. | ||
321 | */ | ||
322 | int in_workqueue_context(struct workqueue_struct *wq) | ||
323 | { | ||
324 | return lock_is_held(&wq->lockdep_map); | ||
325 | } | ||
326 | #endif | ||
327 | |||
71 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 328 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
72 | 329 | ||
73 | static struct debug_obj_descr work_debug_descr; | 330 | static struct debug_obj_descr work_debug_descr; |
@@ -107,7 +364,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) | |||
107 | * statically initialized. We just make sure that it | 364 | * statically initialized. We just make sure that it |
108 | * is tracked in the object tracker. | 365 | * is tracked in the object tracker. |
109 | */ | 366 | */ |
110 | if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { | 367 | if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { |
111 | debug_object_init(work, &work_debug_descr); | 368 | debug_object_init(work, &work_debug_descr); |
112 | debug_object_activate(work, &work_debug_descr); | 369 | debug_object_activate(work, &work_debug_descr); |
113 | return 0; | 370 | return 0; |
@@ -181,94 +438,582 @@ static inline void debug_work_deactivate(struct work_struct *work) { } | |||
181 | /* Serializes the accesses to the list of workqueues. */ | 438 | /* Serializes the accesses to the list of workqueues. */ |
182 | static DEFINE_SPINLOCK(workqueue_lock); | 439 | static DEFINE_SPINLOCK(workqueue_lock); |
183 | static LIST_HEAD(workqueues); | 440 | static LIST_HEAD(workqueues); |
441 | static bool workqueue_freezing; /* W: have wqs started freezing? */ | ||
184 | 442 | ||
185 | static int singlethread_cpu __read_mostly; | ||
186 | static const struct cpumask *cpu_singlethread_map __read_mostly; | ||
187 | /* | 443 | /* |
188 | * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD | 444 | * The almighty global cpu workqueues. nr_running is the only field |
189 | * flushes cwq->worklist. This means that flush_workqueue/wait_on_work | 445 | * which is expected to be used frequently by other cpus via |
190 | * which comes in between can't use for_each_online_cpu(). We could | 446 | * try_to_wake_up(). Put it in a separate cacheline. |
191 | * use cpu_possible_map, the cpumask below is more a documentation | ||
192 | * than optimization. | ||
193 | */ | 447 | */ |
194 | static cpumask_var_t cpu_populated_map __read_mostly; | 448 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); |
449 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); | ||
450 | |||
451 | /* | ||
452 | * Global cpu workqueue and nr_running counter for unbound gcwq. The | ||
453 | * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its | ||
454 | * workers have WORKER_UNBOUND set. | ||
455 | */ | ||
456 | static struct global_cwq unbound_global_cwq; | ||
457 | static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ | ||
458 | |||
459 | static int worker_thread(void *__worker); | ||
460 | |||
461 | static struct global_cwq *get_gcwq(unsigned int cpu) | ||
462 | { | ||
463 | if (cpu != WORK_CPU_UNBOUND) | ||
464 | return &per_cpu(global_cwq, cpu); | ||
465 | else | ||
466 | return &unbound_global_cwq; | ||
467 | } | ||
195 | 468 | ||
196 | /* If it's single threaded, it isn't in the list of workqueues. */ | 469 | static atomic_t *get_gcwq_nr_running(unsigned int cpu) |
197 | static inline int is_wq_single_threaded(struct workqueue_struct *wq) | ||
198 | { | 470 | { |
199 | return wq->singlethread; | 471 | if (cpu != WORK_CPU_UNBOUND) |
472 | return &per_cpu(gcwq_nr_running, cpu); | ||
473 | else | ||
474 | return &unbound_gcwq_nr_running; | ||
475 | } | ||
476 | |||
477 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, | ||
478 | struct workqueue_struct *wq) | ||
479 | { | ||
480 | if (!(wq->flags & WQ_UNBOUND)) { | ||
481 | if (likely(cpu < nr_cpu_ids)) { | ||
482 | #ifdef CONFIG_SMP | ||
483 | return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); | ||
484 | #else | ||
485 | return wq->cpu_wq.single; | ||
486 | #endif | ||
487 | } | ||
488 | } else if (likely(cpu == WORK_CPU_UNBOUND)) | ||
489 | return wq->cpu_wq.single; | ||
490 | return NULL; | ||
200 | } | 491 | } |
201 | 492 | ||
202 | static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) | 493 | static unsigned int work_color_to_flags(int color) |
203 | { | 494 | { |
204 | return is_wq_single_threaded(wq) | 495 | return color << WORK_STRUCT_COLOR_SHIFT; |
205 | ? cpu_singlethread_map : cpu_populated_map; | ||
206 | } | 496 | } |
207 | 497 | ||
208 | static | 498 | static int get_work_color(struct work_struct *work) |
209 | struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) | ||
210 | { | 499 | { |
211 | if (unlikely(is_wq_single_threaded(wq))) | 500 | return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & |
212 | cpu = singlethread_cpu; | 501 | ((1 << WORK_STRUCT_COLOR_BITS) - 1); |
213 | return per_cpu_ptr(wq->cpu_wq, cpu); | 502 | } |
503 | |||
504 | static int work_next_color(int color) | ||
505 | { | ||
506 | return (color + 1) % WORK_NR_COLORS; | ||
214 | } | 507 | } |
215 | 508 | ||
216 | /* | 509 | /* |
217 | * Set the workqueue on which a work item is to be run | 510 | * A work's data points to the cwq with WORK_STRUCT_CWQ set while the |
218 | * - Must *only* be called if the pending flag is set | 511 | * work is on queue. Once execution starts, WORK_STRUCT_CWQ is |
512 | * cleared and the work data contains the cpu number it was last on. | ||
513 | * | ||
514 | * set_work_{cwq|cpu}() and clear_work_data() can be used to set the | ||
515 | * cwq, cpu or clear work->data. These functions should only be | ||
516 | * called while the work is owned - ie. while the PENDING bit is set. | ||
517 | * | ||
518 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq | ||
519 | * corresponding to a work. gcwq is available once the work has been | ||
520 | * queued anywhere after initialization. cwq is available only from | ||
521 | * queueing until execution starts. | ||
219 | */ | 522 | */ |
220 | static inline void set_wq_data(struct work_struct *work, | 523 | static inline void set_work_data(struct work_struct *work, unsigned long data, |
221 | struct cpu_workqueue_struct *cwq) | 524 | unsigned long flags) |
222 | { | 525 | { |
223 | unsigned long new; | ||
224 | |||
225 | BUG_ON(!work_pending(work)); | 526 | BUG_ON(!work_pending(work)); |
527 | atomic_long_set(&work->data, data | flags | work_static(work)); | ||
528 | } | ||
226 | 529 | ||
227 | new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); | 530 | static void set_work_cwq(struct work_struct *work, |
228 | new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); | 531 | struct cpu_workqueue_struct *cwq, |
229 | atomic_long_set(&work->data, new); | 532 | unsigned long extra_flags) |
533 | { | ||
534 | set_work_data(work, (unsigned long)cwq, | ||
535 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); | ||
536 | } | ||
537 | |||
538 | static void set_work_cpu(struct work_struct *work, unsigned int cpu) | ||
539 | { | ||
540 | set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); | ||
541 | } | ||
542 | |||
543 | static void clear_work_data(struct work_struct *work) | ||
544 | { | ||
545 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); | ||
546 | } | ||
547 | |||
548 | static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) | ||
549 | { | ||
550 | unsigned long data = atomic_long_read(&work->data); | ||
551 | |||
552 | if (data & WORK_STRUCT_CWQ) | ||
553 | return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); | ||
554 | else | ||
555 | return NULL; | ||
556 | } | ||
557 | |||
558 | static struct global_cwq *get_work_gcwq(struct work_struct *work) | ||
559 | { | ||
560 | unsigned long data = atomic_long_read(&work->data); | ||
561 | unsigned int cpu; | ||
562 | |||
563 | if (data & WORK_STRUCT_CWQ) | ||
564 | return ((struct cpu_workqueue_struct *) | ||
565 | (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; | ||
566 | |||
567 | cpu = data >> WORK_STRUCT_FLAG_BITS; | ||
568 | if (cpu == WORK_CPU_NONE) | ||
569 | return NULL; | ||
570 | |||
571 | BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); | ||
572 | return get_gcwq(cpu); | ||
573 | } | ||
574 | |||
575 | /* | ||
576 | * Policy functions. These define the policies on how the global | ||
577 | * worker pool is managed. Unless noted otherwise, these functions | ||
578 | * assume that they're being called with gcwq->lock held. | ||
579 | */ | ||
580 | |||
581 | static bool __need_more_worker(struct global_cwq *gcwq) | ||
582 | { | ||
583 | return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || | ||
584 | gcwq->flags & GCWQ_HIGHPRI_PENDING; | ||
230 | } | 585 | } |
231 | 586 | ||
232 | /* | 587 | /* |
233 | * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. | 588 | * Need to wake up a worker? Called from anything but currently |
589 | * running workers. | ||
234 | */ | 590 | */ |
235 | static inline void clear_wq_data(struct work_struct *work) | 591 | static bool need_more_worker(struct global_cwq *gcwq) |
236 | { | 592 | { |
237 | unsigned long flags = *work_data_bits(work) & | 593 | return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); |
238 | (1UL << WORK_STRUCT_STATIC); | ||
239 | atomic_long_set(&work->data, flags); | ||
240 | } | 594 | } |
241 | 595 | ||
242 | static inline | 596 | /* Can I start working? Called from busy but !running workers. */ |
243 | struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) | 597 | static bool may_start_working(struct global_cwq *gcwq) |
244 | { | 598 | { |
245 | return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); | 599 | return gcwq->nr_idle; |
246 | } | 600 | } |
247 | 601 | ||
602 | /* Do I need to keep working? Called from currently running workers. */ | ||
603 | static bool keep_working(struct global_cwq *gcwq) | ||
604 | { | ||
605 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | ||
606 | |||
607 | return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; | ||
608 | } | ||
609 | |||
610 | /* Do we need a new worker? Called from manager. */ | ||
611 | static bool need_to_create_worker(struct global_cwq *gcwq) | ||
612 | { | ||
613 | return need_more_worker(gcwq) && !may_start_working(gcwq); | ||
614 | } | ||
615 | |||
616 | /* Do I need to be the manager? */ | ||
617 | static bool need_to_manage_workers(struct global_cwq *gcwq) | ||
618 | { | ||
619 | return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; | ||
620 | } | ||
621 | |||
622 | /* Do we have too many workers and should some go away? */ | ||
623 | static bool too_many_workers(struct global_cwq *gcwq) | ||
624 | { | ||
625 | bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; | ||
626 | int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ | ||
627 | int nr_busy = gcwq->nr_workers - nr_idle; | ||
628 | |||
629 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | ||
630 | } | ||
631 | |||
632 | /* | ||
633 | * Wake up functions. | ||
634 | */ | ||
635 | |||
636 | /* Return the first worker. Safe with preemption disabled */ | ||
637 | static struct worker *first_worker(struct global_cwq *gcwq) | ||
638 | { | ||
639 | if (unlikely(list_empty(&gcwq->idle_list))) | ||
640 | return NULL; | ||
641 | |||
642 | return list_first_entry(&gcwq->idle_list, struct worker, entry); | ||
643 | } | ||
644 | |||
645 | /** | ||
646 | * wake_up_worker - wake up an idle worker | ||
647 | * @gcwq: gcwq to wake worker for | ||
648 | * | ||
649 | * Wake up the first idle worker of @gcwq. | ||
650 | * | ||
651 | * CONTEXT: | ||
652 | * spin_lock_irq(gcwq->lock). | ||
653 | */ | ||
654 | static void wake_up_worker(struct global_cwq *gcwq) | ||
655 | { | ||
656 | struct worker *worker = first_worker(gcwq); | ||
657 | |||
658 | if (likely(worker)) | ||
659 | wake_up_process(worker->task); | ||
660 | } | ||
661 | |||
662 | /** | ||
663 | * wq_worker_waking_up - a worker is waking up | ||
664 | * @task: task waking up | ||
665 | * @cpu: CPU @task is waking up to | ||
666 | * | ||
667 | * This function is called during try_to_wake_up() when a worker is | ||
668 | * being awoken. | ||
669 | * | ||
670 | * CONTEXT: | ||
671 | * spin_lock_irq(rq->lock) | ||
672 | */ | ||
673 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | ||
674 | { | ||
675 | struct worker *worker = kthread_data(task); | ||
676 | |||
677 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | ||
678 | atomic_inc(get_gcwq_nr_running(cpu)); | ||
679 | } | ||
680 | |||
681 | /** | ||
682 | * wq_worker_sleeping - a worker is going to sleep | ||
683 | * @task: task going to sleep | ||
684 | * @cpu: CPU in question, must be the current CPU number | ||
685 | * | ||
686 | * This function is called during schedule() when a busy worker is | ||
687 | * going to sleep. Worker on the same cpu can be woken up by | ||
688 | * returning pointer to its task. | ||
689 | * | ||
690 | * CONTEXT: | ||
691 | * spin_lock_irq(rq->lock) | ||
692 | * | ||
693 | * RETURNS: | ||
694 | * Worker task on @cpu to wake up, %NULL if none. | ||
695 | */ | ||
696 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
697 | unsigned int cpu) | ||
698 | { | ||
699 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | ||
700 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
701 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | ||
702 | |||
703 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | ||
704 | return NULL; | ||
705 | |||
706 | /* this can only happen on the local cpu */ | ||
707 | BUG_ON(cpu != raw_smp_processor_id()); | ||
708 | |||
709 | /* | ||
710 | * The counterpart of the following dec_and_test, implied mb, | ||
711 | * worklist not empty test sequence is in insert_work(). | ||
712 | * Please read comment there. | ||
713 | * | ||
714 | * NOT_RUNNING is clear. This means that trustee is not in | ||
715 | * charge and we're running on the local cpu w/ rq lock held | ||
716 | * and preemption disabled, which in turn means that no one else | ||
717 | * could be manipulating idle_list, so dereferencing idle_list | ||
718 | * without gcwq lock is safe. | ||
719 | */ | ||
720 | if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) | ||
721 | to_wakeup = first_worker(gcwq); | ||
722 | return to_wakeup ? to_wakeup->task : NULL; | ||
723 | } | ||
724 | |||
725 | /** | ||
726 | * worker_set_flags - set worker flags and adjust nr_running accordingly | ||
727 | * @worker: self | ||
728 | * @flags: flags to set | ||
729 | * @wakeup: wakeup an idle worker if necessary | ||
730 | * | ||
731 | * Set @flags in @worker->flags and adjust nr_running accordingly. If | ||
732 | * nr_running becomes zero and @wakeup is %true, an idle worker is | ||
733 | * woken up. | ||
734 | * | ||
735 | * CONTEXT: | ||
736 | * spin_lock_irq(gcwq->lock) | ||
737 | */ | ||
738 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, | ||
739 | bool wakeup) | ||
740 | { | ||
741 | struct global_cwq *gcwq = worker->gcwq; | ||
742 | |||
743 | WARN_ON_ONCE(worker->task != current); | ||
744 | |||
745 | /* | ||
746 | * If transitioning into NOT_RUNNING, adjust nr_running and | ||
747 | * wake up an idle worker as necessary if requested by | ||
748 | * @wakeup. | ||
749 | */ | ||
750 | if ((flags & WORKER_NOT_RUNNING) && | ||
751 | !(worker->flags & WORKER_NOT_RUNNING)) { | ||
752 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | ||
753 | |||
754 | if (wakeup) { | ||
755 | if (atomic_dec_and_test(nr_running) && | ||
756 | !list_empty(&gcwq->worklist)) | ||
757 | wake_up_worker(gcwq); | ||
758 | } else | ||
759 | atomic_dec(nr_running); | ||
760 | } | ||
761 | |||
762 | worker->flags |= flags; | ||
763 | } | ||
764 | |||
765 | /** | ||
766 | * worker_clr_flags - clear worker flags and adjust nr_running accordingly | ||
767 | * @worker: self | ||
768 | * @flags: flags to clear | ||
769 | * | ||
770 | * Clear @flags in @worker->flags and adjust nr_running accordingly. | ||
771 | * | ||
772 | * CONTEXT: | ||
773 | * spin_lock_irq(gcwq->lock) | ||
774 | */ | ||
775 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | ||
776 | { | ||
777 | struct global_cwq *gcwq = worker->gcwq; | ||
778 | unsigned int oflags = worker->flags; | ||
779 | |||
780 | WARN_ON_ONCE(worker->task != current); | ||
781 | |||
782 | worker->flags &= ~flags; | ||
783 | |||
784 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | ||
785 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | ||
786 | if (!(worker->flags & WORKER_NOT_RUNNING)) | ||
787 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | ||
788 | } | ||
789 | |||
790 | /** | ||
791 | * busy_worker_head - return the busy hash head for a work | ||
792 | * @gcwq: gcwq of interest | ||
793 | * @work: work to be hashed | ||
794 | * | ||
795 | * Return hash head of @gcwq for @work. | ||
796 | * | ||
797 | * CONTEXT: | ||
798 | * spin_lock_irq(gcwq->lock). | ||
799 | * | ||
800 | * RETURNS: | ||
801 | * Pointer to the hash head. | ||
802 | */ | ||
803 | static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, | ||
804 | struct work_struct *work) | ||
805 | { | ||
806 | const int base_shift = ilog2(sizeof(struct work_struct)); | ||
807 | unsigned long v = (unsigned long)work; | ||
808 | |||
809 | /* simple shift and fold hash, do we need something better? */ | ||
810 | v >>= base_shift; | ||
811 | v += v >> BUSY_WORKER_HASH_ORDER; | ||
812 | v &= BUSY_WORKER_HASH_MASK; | ||
813 | |||
814 | return &gcwq->busy_hash[v]; | ||
815 | } | ||
816 | |||
817 | /** | ||
818 | * __find_worker_executing_work - find worker which is executing a work | ||
819 | * @gcwq: gcwq of interest | ||
820 | * @bwh: hash head as returned by busy_worker_head() | ||
821 | * @work: work to find worker for | ||
822 | * | ||
823 | * Find a worker which is executing @work on @gcwq. @bwh should be | ||
824 | * the hash head obtained by calling busy_worker_head() with the same | ||
825 | * work. | ||
826 | * | ||
827 | * CONTEXT: | ||
828 | * spin_lock_irq(gcwq->lock). | ||
829 | * | ||
830 | * RETURNS: | ||
831 | * Pointer to worker which is executing @work if found, NULL | ||
832 | * otherwise. | ||
833 | */ | ||
834 | static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, | ||
835 | struct hlist_head *bwh, | ||
836 | struct work_struct *work) | ||
837 | { | ||
838 | struct worker *worker; | ||
839 | struct hlist_node *tmp; | ||
840 | |||
841 | hlist_for_each_entry(worker, tmp, bwh, hentry) | ||
842 | if (worker->current_work == work) | ||
843 | return worker; | ||
844 | return NULL; | ||
845 | } | ||
846 | |||
847 | /** | ||
848 | * find_worker_executing_work - find worker which is executing a work | ||
849 | * @gcwq: gcwq of interest | ||
850 | * @work: work to find worker for | ||
851 | * | ||
852 | * Find a worker which is executing @work on @gcwq. This function is | ||
853 | * identical to __find_worker_executing_work() except that this | ||
854 | * function calculates @bwh itself. | ||
855 | * | ||
856 | * CONTEXT: | ||
857 | * spin_lock_irq(gcwq->lock). | ||
858 | * | ||
859 | * RETURNS: | ||
860 | * Pointer to worker which is executing @work if found, NULL | ||
861 | * otherwise. | ||
862 | */ | ||
863 | static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | ||
864 | struct work_struct *work) | ||
865 | { | ||
866 | return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), | ||
867 | work); | ||
868 | } | ||
869 | |||
870 | /** | ||
871 | * gcwq_determine_ins_pos - find insertion position | ||
872 | * @gcwq: gcwq of interest | ||
873 | * @cwq: cwq a work is being queued for | ||
874 | * | ||
875 | * A work for @cwq is about to be queued on @gcwq, determine insertion | ||
876 | * position for the work. If @cwq is for HIGHPRI wq, the work is | ||
877 | * queued at the head of the queue but in FIFO order with respect to | ||
878 | * other HIGHPRI works; otherwise, at the end of the queue. This | ||
879 | * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that | ||
880 | * there are HIGHPRI works pending. | ||
881 | * | ||
882 | * CONTEXT: | ||
883 | * spin_lock_irq(gcwq->lock). | ||
884 | * | ||
885 | * RETURNS: | ||
886 | * Pointer to insertion position. | ||
887 | */ | ||
888 | static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, | ||
889 | struct cpu_workqueue_struct *cwq) | ||
890 | { | ||
891 | struct work_struct *twork; | ||
892 | |||
893 | if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) | ||
894 | return &gcwq->worklist; | ||
895 | |||
896 | list_for_each_entry(twork, &gcwq->worklist, entry) { | ||
897 | struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); | ||
898 | |||
899 | if (!(tcwq->wq->flags & WQ_HIGHPRI)) | ||
900 | break; | ||
901 | } | ||
902 | |||
903 | gcwq->flags |= GCWQ_HIGHPRI_PENDING; | ||
904 | return &twork->entry; | ||
905 | } | ||
906 | |||
907 | /** | ||
908 | * insert_work - insert a work into gcwq | ||
909 | * @cwq: cwq @work belongs to | ||
910 | * @work: work to insert | ||
911 | * @head: insertion point | ||
912 | * @extra_flags: extra WORK_STRUCT_* flags to set | ||
913 | * | ||
914 | * Insert @work which belongs to @cwq into @gcwq right before @head. | ||
915 | * @extra_flags is or'd to work_struct flags. | ||
916 | * | ||
917 | * CONTEXT: | ||
918 | * spin_lock_irq(gcwq->lock). | ||
919 | */ | ||
248 | static void insert_work(struct cpu_workqueue_struct *cwq, | 920 | static void insert_work(struct cpu_workqueue_struct *cwq, |
249 | struct work_struct *work, struct list_head *head) | 921 | struct work_struct *work, struct list_head *head, |
922 | unsigned int extra_flags) | ||
250 | { | 923 | { |
251 | trace_workqueue_insertion(cwq->thread, work); | 924 | struct global_cwq *gcwq = cwq->gcwq; |
925 | |||
926 | /* we own @work, set data and link */ | ||
927 | set_work_cwq(work, cwq, extra_flags); | ||
252 | 928 | ||
253 | set_wq_data(work, cwq); | ||
254 | /* | 929 | /* |
255 | * Ensure that we get the right work->data if we see the | 930 | * Ensure that we get the right work->data if we see the |
256 | * result of list_add() below, see try_to_grab_pending(). | 931 | * result of list_add() below, see try_to_grab_pending(). |
257 | */ | 932 | */ |
258 | smp_wmb(); | 933 | smp_wmb(); |
934 | |||
259 | list_add_tail(&work->entry, head); | 935 | list_add_tail(&work->entry, head); |
260 | wake_up(&cwq->more_work); | 936 | |
937 | /* | ||
938 | * Ensure either worker_sched_deactivated() sees the above | ||
939 | * list_add_tail() or we see zero nr_running to avoid workers | ||
940 | * lying around lazily while there are works to be processed. | ||
941 | */ | ||
942 | smp_mb(); | ||
943 | |||
944 | if (__need_more_worker(gcwq)) | ||
945 | wake_up_worker(gcwq); | ||
261 | } | 946 | } |
262 | 947 | ||
263 | static void __queue_work(struct cpu_workqueue_struct *cwq, | 948 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
264 | struct work_struct *work) | 949 | struct work_struct *work) |
265 | { | 950 | { |
951 | struct global_cwq *gcwq; | ||
952 | struct cpu_workqueue_struct *cwq; | ||
953 | struct list_head *worklist; | ||
954 | unsigned int work_flags; | ||
266 | unsigned long flags; | 955 | unsigned long flags; |
267 | 956 | ||
268 | debug_work_activate(work); | 957 | debug_work_activate(work); |
269 | spin_lock_irqsave(&cwq->lock, flags); | 958 | |
270 | insert_work(cwq, work, &cwq->worklist); | 959 | if (WARN_ON_ONCE(wq->flags & WQ_DYING)) |
271 | spin_unlock_irqrestore(&cwq->lock, flags); | 960 | return; |
961 | |||
962 | /* determine gcwq to use */ | ||
963 | if (!(wq->flags & WQ_UNBOUND)) { | ||
964 | struct global_cwq *last_gcwq; | ||
965 | |||
966 | if (unlikely(cpu == WORK_CPU_UNBOUND)) | ||
967 | cpu = raw_smp_processor_id(); | ||
968 | |||
969 | /* | ||
970 | * It's multi cpu. If @wq is non-reentrant and @work | ||
971 | * was previously on a different cpu, it might still | ||
972 | * be running there, in which case the work needs to | ||
973 | * be queued on that cpu to guarantee non-reentrance. | ||
974 | */ | ||
975 | gcwq = get_gcwq(cpu); | ||
976 | if (wq->flags & WQ_NON_REENTRANT && | ||
977 | (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { | ||
978 | struct worker *worker; | ||
979 | |||
980 | spin_lock_irqsave(&last_gcwq->lock, flags); | ||
981 | |||
982 | worker = find_worker_executing_work(last_gcwq, work); | ||
983 | |||
984 | if (worker && worker->current_cwq->wq == wq) | ||
985 | gcwq = last_gcwq; | ||
986 | else { | ||
987 | /* meh... not running there, queue here */ | ||
988 | spin_unlock_irqrestore(&last_gcwq->lock, flags); | ||
989 | spin_lock_irqsave(&gcwq->lock, flags); | ||
990 | } | ||
991 | } else | ||
992 | spin_lock_irqsave(&gcwq->lock, flags); | ||
993 | } else { | ||
994 | gcwq = get_gcwq(WORK_CPU_UNBOUND); | ||
995 | spin_lock_irqsave(&gcwq->lock, flags); | ||
996 | } | ||
997 | |||
998 | /* gcwq determined, get cwq and queue */ | ||
999 | cwq = get_cwq(gcwq->cpu, wq); | ||
1000 | |||
1001 | BUG_ON(!list_empty(&work->entry)); | ||
1002 | |||
1003 | cwq->nr_in_flight[cwq->work_color]++; | ||
1004 | work_flags = work_color_to_flags(cwq->work_color); | ||
1005 | |||
1006 | if (likely(cwq->nr_active < cwq->max_active)) { | ||
1007 | cwq->nr_active++; | ||
1008 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | ||
1009 | } else { | ||
1010 | work_flags |= WORK_STRUCT_DELAYED; | ||
1011 | worklist = &cwq->delayed_works; | ||
1012 | } | ||
1013 | |||
1014 | insert_work(cwq, work, worklist, work_flags); | ||
1015 | |||
1016 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
272 | } | 1017 | } |
273 | 1018 | ||
274 | /** | 1019 | /** |
@@ -308,9 +1053,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) | |||
308 | { | 1053 | { |
309 | int ret = 0; | 1054 | int ret = 0; |
310 | 1055 | ||
311 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 1056 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
312 | BUG_ON(!list_empty(&work->entry)); | 1057 | __queue_work(cpu, wq, work); |
313 | __queue_work(wq_per_cpu(wq, cpu), work); | ||
314 | ret = 1; | 1058 | ret = 1; |
315 | } | 1059 | } |
316 | return ret; | 1060 | return ret; |
@@ -320,10 +1064,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); | |||
320 | static void delayed_work_timer_fn(unsigned long __data) | 1064 | static void delayed_work_timer_fn(unsigned long __data) |
321 | { | 1065 | { |
322 | struct delayed_work *dwork = (struct delayed_work *)__data; | 1066 | struct delayed_work *dwork = (struct delayed_work *)__data; |
323 | struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); | 1067 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); |
324 | struct workqueue_struct *wq = cwq->wq; | ||
325 | 1068 | ||
326 | __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); | 1069 | __queue_work(smp_processor_id(), cwq->wq, &dwork->work); |
327 | } | 1070 | } |
328 | 1071 | ||
329 | /** | 1072 | /** |
@@ -360,14 +1103,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
360 | struct timer_list *timer = &dwork->timer; | 1103 | struct timer_list *timer = &dwork->timer; |
361 | struct work_struct *work = &dwork->work; | 1104 | struct work_struct *work = &dwork->work; |
362 | 1105 | ||
363 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 1106 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
1107 | unsigned int lcpu; | ||
1108 | |||
364 | BUG_ON(timer_pending(timer)); | 1109 | BUG_ON(timer_pending(timer)); |
365 | BUG_ON(!list_empty(&work->entry)); | 1110 | BUG_ON(!list_empty(&work->entry)); |
366 | 1111 | ||
367 | timer_stats_timer_set_start_info(&dwork->timer); | 1112 | timer_stats_timer_set_start_info(&dwork->timer); |
368 | 1113 | ||
369 | /* This stores cwq for the moment, for the timer_fn */ | 1114 | /* |
370 | set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); | 1115 | * This stores cwq for the moment, for the timer_fn. |
1116 | * Note that the work's gcwq is preserved to allow | ||
1117 | * reentrance detection for delayed works. | ||
1118 | */ | ||
1119 | if (!(wq->flags & WQ_UNBOUND)) { | ||
1120 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
1121 | |||
1122 | if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) | ||
1123 | lcpu = gcwq->cpu; | ||
1124 | else | ||
1125 | lcpu = raw_smp_processor_id(); | ||
1126 | } else | ||
1127 | lcpu = WORK_CPU_UNBOUND; | ||
1128 | |||
1129 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
1130 | |||
371 | timer->expires = jiffies + delay; | 1131 | timer->expires = jiffies + delay; |
372 | timer->data = (unsigned long)dwork; | 1132 | timer->data = (unsigned long)dwork; |
373 | timer->function = delayed_work_timer_fn; | 1133 | timer->function = delayed_work_timer_fn; |
@@ -382,80 +1142,888 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
382 | } | 1142 | } |
383 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | 1143 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); |
384 | 1144 | ||
385 | static void run_workqueue(struct cpu_workqueue_struct *cwq) | 1145 | /** |
1146 | * worker_enter_idle - enter idle state | ||
1147 | * @worker: worker which is entering idle state | ||
1148 | * | ||
1149 | * @worker is entering idle state. Update stats and idle timer if | ||
1150 | * necessary. | ||
1151 | * | ||
1152 | * LOCKING: | ||
1153 | * spin_lock_irq(gcwq->lock). | ||
1154 | */ | ||
1155 | static void worker_enter_idle(struct worker *worker) | ||
386 | { | 1156 | { |
387 | spin_lock_irq(&cwq->lock); | 1157 | struct global_cwq *gcwq = worker->gcwq; |
388 | while (!list_empty(&cwq->worklist)) { | 1158 | |
389 | struct work_struct *work = list_entry(cwq->worklist.next, | 1159 | BUG_ON(worker->flags & WORKER_IDLE); |
390 | struct work_struct, entry); | 1160 | BUG_ON(!list_empty(&worker->entry) && |
391 | work_func_t f = work->func; | 1161 | (worker->hentry.next || worker->hentry.pprev)); |
392 | #ifdef CONFIG_LOCKDEP | 1162 | |
1163 | /* can't use worker_set_flags(), also called from start_worker() */ | ||
1164 | worker->flags |= WORKER_IDLE; | ||
1165 | gcwq->nr_idle++; | ||
1166 | worker->last_active = jiffies; | ||
1167 | |||
1168 | /* idle_list is LIFO */ | ||
1169 | list_add(&worker->entry, &gcwq->idle_list); | ||
1170 | |||
1171 | if (likely(!(worker->flags & WORKER_ROGUE))) { | ||
1172 | if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) | ||
1173 | mod_timer(&gcwq->idle_timer, | ||
1174 | jiffies + IDLE_WORKER_TIMEOUT); | ||
1175 | } else | ||
1176 | wake_up_all(&gcwq->trustee_wait); | ||
1177 | |||
1178 | /* sanity check nr_running */ | ||
1179 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | ||
1180 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | ||
1181 | } | ||
1182 | |||
1183 | /** | ||
1184 | * worker_leave_idle - leave idle state | ||
1185 | * @worker: worker which is leaving idle state | ||
1186 | * | ||
1187 | * @worker is leaving idle state. Update stats. | ||
1188 | * | ||
1189 | * LOCKING: | ||
1190 | * spin_lock_irq(gcwq->lock). | ||
1191 | */ | ||
1192 | static void worker_leave_idle(struct worker *worker) | ||
1193 | { | ||
1194 | struct global_cwq *gcwq = worker->gcwq; | ||
1195 | |||
1196 | BUG_ON(!(worker->flags & WORKER_IDLE)); | ||
1197 | worker_clr_flags(worker, WORKER_IDLE); | ||
1198 | gcwq->nr_idle--; | ||
1199 | list_del_init(&worker->entry); | ||
1200 | } | ||
1201 | |||
1202 | /** | ||
1203 | * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq | ||
1204 | * @worker: self | ||
1205 | * | ||
1206 | * Works which are scheduled while the cpu is online must at least be | ||
1207 | * scheduled to a worker which is bound to the cpu so that if they are | ||
1208 | * flushed from cpu callbacks while cpu is going down, they are | ||
1209 | * guaranteed to execute on the cpu. | ||
1210 | * | ||
1211 | * This function is to be used by rogue workers and rescuers to bind | ||
1212 | * themselves to the target cpu and may race with cpu going down or | ||
1213 | * coming online. kthread_bind() can't be used because it may put the | ||
1214 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | ||
1215 | * verbatim as it's best effort and blocking and gcwq may be | ||
1216 | * [dis]associated in the meantime. | ||
1217 | * | ||
1218 | * This function tries set_cpus_allowed() and locks gcwq and verifies | ||
1219 | * the binding against GCWQ_DISASSOCIATED which is set during | ||
1220 | * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters | ||
1221 | * idle state or fetches works without dropping lock, it can guarantee | ||
1222 | * the scheduling requirement described in the first paragraph. | ||
1223 | * | ||
1224 | * CONTEXT: | ||
1225 | * Might sleep. Called without any lock but returns with gcwq->lock | ||
1226 | * held. | ||
1227 | * | ||
1228 | * RETURNS: | ||
1229 | * %true if the associated gcwq is online (@worker is successfully | ||
1230 | * bound), %false if offline. | ||
1231 | */ | ||
1232 | static bool worker_maybe_bind_and_lock(struct worker *worker) | ||
1233 | __acquires(&gcwq->lock) | ||
1234 | { | ||
1235 | struct global_cwq *gcwq = worker->gcwq; | ||
1236 | struct task_struct *task = worker->task; | ||
1237 | |||
1238 | while (true) { | ||
393 | /* | 1239 | /* |
394 | * It is permissible to free the struct work_struct | 1240 | * The following call may fail, succeed or succeed |
395 | * from inside the function that is called from it, | 1241 | * without actually migrating the task to the cpu if |
396 | * this we need to take into account for lockdep too. | 1242 | * it races with cpu hotunplug operation. Verify |
397 | * To avoid bogus "held lock freed" warnings as well | 1243 | * against GCWQ_DISASSOCIATED. |
398 | * as problems when looking into work->lockdep_map, | ||
399 | * make a copy and use that here. | ||
400 | */ | 1244 | */ |
401 | struct lockdep_map lockdep_map = work->lockdep_map; | 1245 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) |
402 | #endif | 1246 | set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); |
403 | trace_workqueue_execution(cwq->thread, work); | 1247 | |
404 | debug_work_deactivate(work); | 1248 | spin_lock_irq(&gcwq->lock); |
405 | cwq->current_work = work; | 1249 | if (gcwq->flags & GCWQ_DISASSOCIATED) |
406 | list_del_init(cwq->worklist.next); | 1250 | return false; |
407 | spin_unlock_irq(&cwq->lock); | 1251 | if (task_cpu(task) == gcwq->cpu && |
408 | 1252 | cpumask_equal(&current->cpus_allowed, | |
409 | BUG_ON(get_wq_data(work) != cwq); | 1253 | get_cpu_mask(gcwq->cpu))) |
410 | work_clear_pending(work); | 1254 | return true; |
411 | lock_map_acquire(&cwq->wq->lockdep_map); | 1255 | spin_unlock_irq(&gcwq->lock); |
412 | lock_map_acquire(&lockdep_map); | 1256 | |
413 | f(work); | 1257 | /* CPU has come up inbetween, retry migration */ |
414 | lock_map_release(&lockdep_map); | 1258 | cpu_relax(); |
415 | lock_map_release(&cwq->wq->lockdep_map); | 1259 | } |
416 | 1260 | } | |
417 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 1261 | |
418 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | 1262 | /* |
419 | "%s/0x%08x/%d\n", | 1263 | * Function for worker->rebind_work used to rebind rogue busy workers |
420 | current->comm, preempt_count(), | 1264 | * to the associated cpu which is coming back online. This is |
421 | task_pid_nr(current)); | 1265 | * scheduled by cpu up but can race with other cpu hotplug operations |
422 | printk(KERN_ERR " last function: "); | 1266 | * and may be executed twice without intervening cpu down. |
423 | print_symbol("%s\n", (unsigned long)f); | 1267 | */ |
424 | debug_show_held_locks(current); | 1268 | static void worker_rebind_fn(struct work_struct *work) |
425 | dump_stack(); | 1269 | { |
1270 | struct worker *worker = container_of(work, struct worker, rebind_work); | ||
1271 | struct global_cwq *gcwq = worker->gcwq; | ||
1272 | |||
1273 | if (worker_maybe_bind_and_lock(worker)) | ||
1274 | worker_clr_flags(worker, WORKER_REBIND); | ||
1275 | |||
1276 | spin_unlock_irq(&gcwq->lock); | ||
1277 | } | ||
1278 | |||
1279 | static struct worker *alloc_worker(void) | ||
1280 | { | ||
1281 | struct worker *worker; | ||
1282 | |||
1283 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | ||
1284 | if (worker) { | ||
1285 | INIT_LIST_HEAD(&worker->entry); | ||
1286 | INIT_LIST_HEAD(&worker->scheduled); | ||
1287 | INIT_WORK(&worker->rebind_work, worker_rebind_fn); | ||
1288 | /* on creation a worker is in !idle && prep state */ | ||
1289 | worker->flags = WORKER_PREP; | ||
1290 | } | ||
1291 | return worker; | ||
1292 | } | ||
1293 | |||
1294 | /** | ||
1295 | * create_worker - create a new workqueue worker | ||
1296 | * @gcwq: gcwq the new worker will belong to | ||
1297 | * @bind: whether to set affinity to @gcwq->cpu or not | ||
1298 | * | ||
1299 | * Create a new worker which is bound to @gcwq. The returned worker | ||
1300 | * can be started by calling start_worker() or destroyed using | ||
1301 | * destroy_worker(). | ||
1302 | * | ||
1303 | * CONTEXT: | ||
1304 | * Might sleep. Does GFP_KERNEL allocations. | ||
1305 | * | ||
1306 | * RETURNS: | ||
1307 | * Pointer to the newly created worker. | ||
1308 | */ | ||
1309 | static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | ||
1310 | { | ||
1311 | bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; | ||
1312 | struct worker *worker = NULL; | ||
1313 | int id = -1; | ||
1314 | |||
1315 | spin_lock_irq(&gcwq->lock); | ||
1316 | while (ida_get_new(&gcwq->worker_ida, &id)) { | ||
1317 | spin_unlock_irq(&gcwq->lock); | ||
1318 | if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) | ||
1319 | goto fail; | ||
1320 | spin_lock_irq(&gcwq->lock); | ||
1321 | } | ||
1322 | spin_unlock_irq(&gcwq->lock); | ||
1323 | |||
1324 | worker = alloc_worker(); | ||
1325 | if (!worker) | ||
1326 | goto fail; | ||
1327 | |||
1328 | worker->gcwq = gcwq; | ||
1329 | worker->id = id; | ||
1330 | |||
1331 | if (!on_unbound_cpu) | ||
1332 | worker->task = kthread_create(worker_thread, worker, | ||
1333 | "kworker/%u:%d", gcwq->cpu, id); | ||
1334 | else | ||
1335 | worker->task = kthread_create(worker_thread, worker, | ||
1336 | "kworker/u:%d", id); | ||
1337 | if (IS_ERR(worker->task)) | ||
1338 | goto fail; | ||
1339 | |||
1340 | /* | ||
1341 | * A rogue worker will become a regular one if CPU comes | ||
1342 | * online later on. Make sure every worker has | ||
1343 | * PF_THREAD_BOUND set. | ||
1344 | */ | ||
1345 | if (bind && !on_unbound_cpu) | ||
1346 | kthread_bind(worker->task, gcwq->cpu); | ||
1347 | else { | ||
1348 | worker->task->flags |= PF_THREAD_BOUND; | ||
1349 | if (on_unbound_cpu) | ||
1350 | worker->flags |= WORKER_UNBOUND; | ||
1351 | } | ||
1352 | |||
1353 | return worker; | ||
1354 | fail: | ||
1355 | if (id >= 0) { | ||
1356 | spin_lock_irq(&gcwq->lock); | ||
1357 | ida_remove(&gcwq->worker_ida, id); | ||
1358 | spin_unlock_irq(&gcwq->lock); | ||
1359 | } | ||
1360 | kfree(worker); | ||
1361 | return NULL; | ||
1362 | } | ||
1363 | |||
1364 | /** | ||
1365 | * start_worker - start a newly created worker | ||
1366 | * @worker: worker to start | ||
1367 | * | ||
1368 | * Make the gcwq aware of @worker and start it. | ||
1369 | * | ||
1370 | * CONTEXT: | ||
1371 | * spin_lock_irq(gcwq->lock). | ||
1372 | */ | ||
1373 | static void start_worker(struct worker *worker) | ||
1374 | { | ||
1375 | worker->flags |= WORKER_STARTED; | ||
1376 | worker->gcwq->nr_workers++; | ||
1377 | worker_enter_idle(worker); | ||
1378 | wake_up_process(worker->task); | ||
1379 | } | ||
1380 | |||
1381 | /** | ||
1382 | * destroy_worker - destroy a workqueue worker | ||
1383 | * @worker: worker to be destroyed | ||
1384 | * | ||
1385 | * Destroy @worker and adjust @gcwq stats accordingly. | ||
1386 | * | ||
1387 | * CONTEXT: | ||
1388 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | ||
1389 | */ | ||
1390 | static void destroy_worker(struct worker *worker) | ||
1391 | { | ||
1392 | struct global_cwq *gcwq = worker->gcwq; | ||
1393 | int id = worker->id; | ||
1394 | |||
1395 | /* sanity check frenzy */ | ||
1396 | BUG_ON(worker->current_work); | ||
1397 | BUG_ON(!list_empty(&worker->scheduled)); | ||
1398 | |||
1399 | if (worker->flags & WORKER_STARTED) | ||
1400 | gcwq->nr_workers--; | ||
1401 | if (worker->flags & WORKER_IDLE) | ||
1402 | gcwq->nr_idle--; | ||
1403 | |||
1404 | list_del_init(&worker->entry); | ||
1405 | worker->flags |= WORKER_DIE; | ||
1406 | |||
1407 | spin_unlock_irq(&gcwq->lock); | ||
1408 | |||
1409 | kthread_stop(worker->task); | ||
1410 | kfree(worker); | ||
1411 | |||
1412 | spin_lock_irq(&gcwq->lock); | ||
1413 | ida_remove(&gcwq->worker_ida, id); | ||
1414 | } | ||
1415 | |||
1416 | static void idle_worker_timeout(unsigned long __gcwq) | ||
1417 | { | ||
1418 | struct global_cwq *gcwq = (void *)__gcwq; | ||
1419 | |||
1420 | spin_lock_irq(&gcwq->lock); | ||
1421 | |||
1422 | if (too_many_workers(gcwq)) { | ||
1423 | struct worker *worker; | ||
1424 | unsigned long expires; | ||
1425 | |||
1426 | /* idle_list is kept in LIFO order, check the last one */ | ||
1427 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | ||
1428 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
1429 | |||
1430 | if (time_before(jiffies, expires)) | ||
1431 | mod_timer(&gcwq->idle_timer, expires); | ||
1432 | else { | ||
1433 | /* it's been idle for too long, wake up manager */ | ||
1434 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
1435 | wake_up_worker(gcwq); | ||
426 | } | 1436 | } |
1437 | } | ||
427 | 1438 | ||
428 | spin_lock_irq(&cwq->lock); | 1439 | spin_unlock_irq(&gcwq->lock); |
429 | cwq->current_work = NULL; | 1440 | } |
1441 | |||
1442 | static bool send_mayday(struct work_struct *work) | ||
1443 | { | ||
1444 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
1445 | struct workqueue_struct *wq = cwq->wq; | ||
1446 | unsigned int cpu; | ||
1447 | |||
1448 | if (!(wq->flags & WQ_RESCUER)) | ||
1449 | return false; | ||
1450 | |||
1451 | /* mayday mayday mayday */ | ||
1452 | cpu = cwq->gcwq->cpu; | ||
1453 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | ||
1454 | if (cpu == WORK_CPU_UNBOUND) | ||
1455 | cpu = 0; | ||
1456 | if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) | ||
1457 | wake_up_process(wq->rescuer->task); | ||
1458 | return true; | ||
1459 | } | ||
1460 | |||
1461 | static void gcwq_mayday_timeout(unsigned long __gcwq) | ||
1462 | { | ||
1463 | struct global_cwq *gcwq = (void *)__gcwq; | ||
1464 | struct work_struct *work; | ||
1465 | |||
1466 | spin_lock_irq(&gcwq->lock); | ||
1467 | |||
1468 | if (need_to_create_worker(gcwq)) { | ||
1469 | /* | ||
1470 | * We've been trying to create a new worker but | ||
1471 | * haven't been successful. We might be hitting an | ||
1472 | * allocation deadlock. Send distress signals to | ||
1473 | * rescuers. | ||
1474 | */ | ||
1475 | list_for_each_entry(work, &gcwq->worklist, entry) | ||
1476 | send_mayday(work); | ||
430 | } | 1477 | } |
431 | spin_unlock_irq(&cwq->lock); | 1478 | |
1479 | spin_unlock_irq(&gcwq->lock); | ||
1480 | |||
1481 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); | ||
432 | } | 1482 | } |
433 | 1483 | ||
434 | static int worker_thread(void *__cwq) | 1484 | /** |
1485 | * maybe_create_worker - create a new worker if necessary | ||
1486 | * @gcwq: gcwq to create a new worker for | ||
1487 | * | ||
1488 | * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to | ||
1489 | * have at least one idle worker on return from this function. If | ||
1490 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is | ||
1491 | * sent to all rescuers with works scheduled on @gcwq to resolve | ||
1492 | * possible allocation deadlock. | ||
1493 | * | ||
1494 | * On return, need_to_create_worker() is guaranteed to be false and | ||
1495 | * may_start_working() true. | ||
1496 | * | ||
1497 | * LOCKING: | ||
1498 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1499 | * multiple times. Does GFP_KERNEL allocations. Called only from | ||
1500 | * manager. | ||
1501 | * | ||
1502 | * RETURNS: | ||
1503 | * false if no action was taken and gcwq->lock stayed locked, true | ||
1504 | * otherwise. | ||
1505 | */ | ||
1506 | static bool maybe_create_worker(struct global_cwq *gcwq) | ||
1507 | __releases(&gcwq->lock) | ||
1508 | __acquires(&gcwq->lock) | ||
435 | { | 1509 | { |
436 | struct cpu_workqueue_struct *cwq = __cwq; | 1510 | if (!need_to_create_worker(gcwq)) |
437 | DEFINE_WAIT(wait); | 1511 | return false; |
1512 | restart: | ||
1513 | spin_unlock_irq(&gcwq->lock); | ||
1514 | |||
1515 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ | ||
1516 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | ||
1517 | |||
1518 | while (true) { | ||
1519 | struct worker *worker; | ||
1520 | |||
1521 | worker = create_worker(gcwq, true); | ||
1522 | if (worker) { | ||
1523 | del_timer_sync(&gcwq->mayday_timer); | ||
1524 | spin_lock_irq(&gcwq->lock); | ||
1525 | start_worker(worker); | ||
1526 | BUG_ON(need_to_create_worker(gcwq)); | ||
1527 | return true; | ||
1528 | } | ||
1529 | |||
1530 | if (!need_to_create_worker(gcwq)) | ||
1531 | break; | ||
438 | 1532 | ||
439 | if (cwq->wq->freezeable) | 1533 | __set_current_state(TASK_INTERRUPTIBLE); |
440 | set_freezable(); | 1534 | schedule_timeout(CREATE_COOLDOWN); |
441 | 1535 | ||
442 | for (;;) { | 1536 | if (!need_to_create_worker(gcwq)) |
443 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); | 1537 | break; |
444 | if (!freezing(current) && | 1538 | } |
445 | !kthread_should_stop() && | 1539 | |
446 | list_empty(&cwq->worklist)) | 1540 | del_timer_sync(&gcwq->mayday_timer); |
447 | schedule(); | 1541 | spin_lock_irq(&gcwq->lock); |
448 | finish_wait(&cwq->more_work, &wait); | 1542 | if (need_to_create_worker(gcwq)) |
1543 | goto restart; | ||
1544 | return true; | ||
1545 | } | ||
1546 | |||
1547 | /** | ||
1548 | * maybe_destroy_workers - destroy workers which have been idle for a while | ||
1549 | * @gcwq: gcwq to destroy workers for | ||
1550 | * | ||
1551 | * Destroy @gcwq workers which have been idle for longer than | ||
1552 | * IDLE_WORKER_TIMEOUT. | ||
1553 | * | ||
1554 | * LOCKING: | ||
1555 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1556 | * multiple times. Called only from manager. | ||
1557 | * | ||
1558 | * RETURNS: | ||
1559 | * false if no action was taken and gcwq->lock stayed locked, true | ||
1560 | * otherwise. | ||
1561 | */ | ||
1562 | static bool maybe_destroy_workers(struct global_cwq *gcwq) | ||
1563 | { | ||
1564 | bool ret = false; | ||
449 | 1565 | ||
450 | try_to_freeze(); | 1566 | while (too_many_workers(gcwq)) { |
1567 | struct worker *worker; | ||
1568 | unsigned long expires; | ||
451 | 1569 | ||
452 | if (kthread_should_stop()) | 1570 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); |
1571 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
1572 | |||
1573 | if (time_before(jiffies, expires)) { | ||
1574 | mod_timer(&gcwq->idle_timer, expires); | ||
453 | break; | 1575 | break; |
1576 | } | ||
454 | 1577 | ||
455 | run_workqueue(cwq); | 1578 | destroy_worker(worker); |
1579 | ret = true; | ||
456 | } | 1580 | } |
457 | 1581 | ||
458 | return 0; | 1582 | return ret; |
1583 | } | ||
1584 | |||
1585 | /** | ||
1586 | * manage_workers - manage worker pool | ||
1587 | * @worker: self | ||
1588 | * | ||
1589 | * Assume the manager role and manage gcwq worker pool @worker belongs | ||
1590 | * to. At any given time, there can be only zero or one manager per | ||
1591 | * gcwq. The exclusion is handled automatically by this function. | ||
1592 | * | ||
1593 | * The caller can safely start processing works on false return. On | ||
1594 | * true return, it's guaranteed that need_to_create_worker() is false | ||
1595 | * and may_start_working() is true. | ||
1596 | * | ||
1597 | * CONTEXT: | ||
1598 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1599 | * multiple times. Does GFP_KERNEL allocations. | ||
1600 | * | ||
1601 | * RETURNS: | ||
1602 | * false if no action was taken and gcwq->lock stayed locked, true if | ||
1603 | * some action was taken. | ||
1604 | */ | ||
1605 | static bool manage_workers(struct worker *worker) | ||
1606 | { | ||
1607 | struct global_cwq *gcwq = worker->gcwq; | ||
1608 | bool ret = false; | ||
1609 | |||
1610 | if (gcwq->flags & GCWQ_MANAGING_WORKERS) | ||
1611 | return ret; | ||
1612 | |||
1613 | gcwq->flags &= ~GCWQ_MANAGE_WORKERS; | ||
1614 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
1615 | |||
1616 | /* | ||
1617 | * Destroy and then create so that may_start_working() is true | ||
1618 | * on return. | ||
1619 | */ | ||
1620 | ret |= maybe_destroy_workers(gcwq); | ||
1621 | ret |= maybe_create_worker(gcwq); | ||
1622 | |||
1623 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
1624 | |||
1625 | /* | ||
1626 | * The trustee might be waiting to take over the manager | ||
1627 | * position, tell it we're done. | ||
1628 | */ | ||
1629 | if (unlikely(gcwq->trustee)) | ||
1630 | wake_up_all(&gcwq->trustee_wait); | ||
1631 | |||
1632 | return ret; | ||
1633 | } | ||
1634 | |||
1635 | /** | ||
1636 | * move_linked_works - move linked works to a list | ||
1637 | * @work: start of series of works to be scheduled | ||
1638 | * @head: target list to append @work to | ||
1639 | * @nextp: out parameter for nested worklist walking | ||
1640 | * | ||
1641 | * Schedule linked works starting from @work to @head. Work series to | ||
1642 | * be scheduled starts at @work and includes any consecutive work with | ||
1643 | * WORK_STRUCT_LINKED set in its predecessor. | ||
1644 | * | ||
1645 | * If @nextp is not NULL, it's updated to point to the next work of | ||
1646 | * the last scheduled work. This allows move_linked_works() to be | ||
1647 | * nested inside outer list_for_each_entry_safe(). | ||
1648 | * | ||
1649 | * CONTEXT: | ||
1650 | * spin_lock_irq(gcwq->lock). | ||
1651 | */ | ||
1652 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
1653 | struct work_struct **nextp) | ||
1654 | { | ||
1655 | struct work_struct *n; | ||
1656 | |||
1657 | /* | ||
1658 | * Linked worklist will always end before the end of the list, | ||
1659 | * use NULL for list head. | ||
1660 | */ | ||
1661 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
1662 | list_move_tail(&work->entry, head); | ||
1663 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
1664 | break; | ||
1665 | } | ||
1666 | |||
1667 | /* | ||
1668 | * If we're already inside safe list traversal and have moved | ||
1669 | * multiple works to the scheduled queue, the next position | ||
1670 | * needs to be updated. | ||
1671 | */ | ||
1672 | if (nextp) | ||
1673 | *nextp = n; | ||
1674 | } | ||
1675 | |||
1676 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
1677 | { | ||
1678 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
1679 | struct work_struct, entry); | ||
1680 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | ||
1681 | |||
1682 | move_linked_works(work, pos, NULL); | ||
1683 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | ||
1684 | cwq->nr_active++; | ||
1685 | } | ||
1686 | |||
1687 | /** | ||
1688 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
1689 | * @cwq: cwq of interest | ||
1690 | * @color: color of work which left the queue | ||
1691 | * @delayed: true if the work is still delayed (not counted in nr_active) | ||
1692 | * | ||
1693 | * A work either has completed or is removed from pending queue, | ||
1694 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
1695 | * | ||
1696 | * CONTEXT: | ||
1697 | * spin_lock_irq(gcwq->lock). | ||
1698 | */ | ||
1699 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, | ||
1700 | bool delayed) | ||
1701 | { | ||
1702 | /* ignore uncolored works */ | ||
1703 | if (color == WORK_NO_COLOR) | ||
1704 | return; | ||
1705 | |||
1706 | cwq->nr_in_flight[color]--; | ||
1707 | |||
1708 | if (!delayed) { | ||
1709 | cwq->nr_active--; | ||
1710 | if (!list_empty(&cwq->delayed_works)) { | ||
1711 | /* one down, submit a delayed one */ | ||
1712 | if (cwq->nr_active < cwq->max_active) | ||
1713 | cwq_activate_first_delayed(cwq); | ||
1714 | } | ||
1715 | } | ||
1716 | |||
1717 | /* is flush in progress and are we at the flushing tip? */ | ||
1718 | if (likely(cwq->flush_color != color)) | ||
1719 | return; | ||
1720 | |||
1721 | /* are there still in-flight works? */ | ||
1722 | if (cwq->nr_in_flight[color]) | ||
1723 | return; | ||
1724 | |||
1725 | /* this cwq is done, clear flush_color */ | ||
1726 | cwq->flush_color = -1; | ||
1727 | |||
1728 | /* | ||
1729 | * If this was the last cwq, wake up the first flusher. It | ||
1730 | * will handle the rest. | ||
1731 | */ | ||
1732 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
1733 | complete(&cwq->wq->first_flusher->done); | ||
1734 | } | ||
1735 | |||
1736 | /** | ||
1737 | * process_one_work - process single work | ||
1738 | * @worker: self | ||
1739 | * @work: work to process | ||
1740 | * | ||
1741 | * Process @work. This function contains all the logic necessary to | ||
1742 | * process a single work including synchronization against and | ||
1743 | * interaction with other workers on the same cpu, queueing and | ||
1744 | * flushing. As long as context requirement is met, any worker can | ||
1745 | * call this function to process a work. | ||
1746 | * | ||
1747 | * CONTEXT: | ||
1748 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | ||
1749 | */ | ||
1750 | static void process_one_work(struct worker *worker, struct work_struct *work) | ||
1751 | __releases(&gcwq->lock) | ||
1752 | __acquires(&gcwq->lock) | ||
1753 | { | ||
1754 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
1755 | struct global_cwq *gcwq = cwq->gcwq; | ||
1756 | struct hlist_head *bwh = busy_worker_head(gcwq, work); | ||
1757 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; | ||
1758 | work_func_t f = work->func; | ||
1759 | int work_color; | ||
1760 | struct worker *collision; | ||
1761 | #ifdef CONFIG_LOCKDEP | ||
1762 | /* | ||
1763 | * It is permissible to free the struct work_struct from | ||
1764 | * inside the function that is called from it; we need to | ||
1765 | * take this into account for lockdep too. To avoid bogus "held | ||
1766 | * lock freed" warnings as well as problems when looking into | ||
1767 | * work->lockdep_map, make a copy and use that here. | ||
1768 | */ | ||
1769 | struct lockdep_map lockdep_map = work->lockdep_map; | ||
1770 | #endif | ||
1771 | /* | ||
1772 | * A single work shouldn't be executed concurrently by | ||
1773 | * multiple workers on a single cpu. Check whether anyone is | ||
1774 | * already processing the work. If so, defer the work to the | ||
1775 | * currently executing one. | ||
1776 | */ | ||
1777 | collision = __find_worker_executing_work(gcwq, bwh, work); | ||
1778 | if (unlikely(collision)) { | ||
1779 | move_linked_works(work, &collision->scheduled, NULL); | ||
1780 | return; | ||
1781 | } | ||
1782 | |||
1783 | /* claim and process */ | ||
1784 | debug_work_deactivate(work); | ||
1785 | hlist_add_head(&worker->hentry, bwh); | ||
1786 | worker->current_work = work; | ||
1787 | worker->current_cwq = cwq; | ||
1788 | work_color = get_work_color(work); | ||
1789 | |||
1790 | /* record the current cpu number in the work data and dequeue */ | ||
1791 | set_work_cpu(work, gcwq->cpu); | ||
1792 | list_del_init(&work->entry); | ||
1793 | |||
1794 | /* | ||
1795 | * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, | ||
1796 | * wake up another worker; otherwise, clear HIGHPRI_PENDING. | ||
1797 | */ | ||
1798 | if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { | ||
1799 | struct work_struct *nwork = list_first_entry(&gcwq->worklist, | ||
1800 | struct work_struct, entry); | ||
1801 | |||
1802 | if (!list_empty(&gcwq->worklist) && | ||
1803 | get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) | ||
1804 | wake_up_worker(gcwq); | ||
1805 | else | ||
1806 | gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; | ||
1807 | } | ||
1808 | |||
1809 | /* | ||
1810 | * CPU intensive works don't participate in concurrency | ||
1811 | * management. They're the scheduler's responsibility. | ||
1812 | */ | ||
1813 | if (unlikely(cpu_intensive)) | ||
1814 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | ||
1815 | |||
1816 | spin_unlock_irq(&gcwq->lock); | ||
1817 | |||
1818 | work_clear_pending(work); | ||
1819 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
1820 | lock_map_acquire(&lockdep_map); | ||
1821 | trace_workqueue_execute_start(work); | ||
1822 | f(work); | ||
1823 | /* | ||
1824 | * While we must be careful to not use "work" after this, the trace | ||
1825 | * point will only record its address. | ||
1826 | */ | ||
1827 | trace_workqueue_execute_end(work); | ||
1828 | lock_map_release(&lockdep_map); | ||
1829 | lock_map_release(&cwq->wq->lockdep_map); | ||
1830 | |||
1831 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | ||
1832 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | ||
1833 | "%s/0x%08x/%d\n", | ||
1834 | current->comm, preempt_count(), task_pid_nr(current)); | ||
1835 | printk(KERN_ERR " last function: "); | ||
1836 | print_symbol("%s\n", (unsigned long)f); | ||
1837 | debug_show_held_locks(current); | ||
1838 | dump_stack(); | ||
1839 | } | ||
1840 | |||
1841 | spin_lock_irq(&gcwq->lock); | ||
1842 | |||
1843 | /* clear cpu intensive status */ | ||
1844 | if (unlikely(cpu_intensive)) | ||
1845 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); | ||
1846 | |||
1847 | /* we're done with it, release */ | ||
1848 | hlist_del_init(&worker->hentry); | ||
1849 | worker->current_work = NULL; | ||
1850 | worker->current_cwq = NULL; | ||
1851 | cwq_dec_nr_in_flight(cwq, work_color, false); | ||
1852 | } | ||
1853 | |||
1854 | /** | ||
1855 | * process_scheduled_works - process scheduled works | ||
1856 | * @worker: self | ||
1857 | * | ||
1858 | * Process all scheduled works. Please note that the scheduled list | ||
1859 | * may change while processing a work, so this function repeatedly | ||
1860 | * fetches a work from the top and executes it. | ||
1861 | * | ||
1862 | * CONTEXT: | ||
1863 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
1864 | * multiple times. | ||
1865 | */ | ||
1866 | static void process_scheduled_works(struct worker *worker) | ||
1867 | { | ||
1868 | while (!list_empty(&worker->scheduled)) { | ||
1869 | struct work_struct *work = list_first_entry(&worker->scheduled, | ||
1870 | struct work_struct, entry); | ||
1871 | process_one_work(worker, work); | ||
1872 | } | ||
1873 | } | ||
1874 | |||
1875 | /** | ||
1876 | * worker_thread - the worker thread function | ||
1877 | * @__worker: self | ||
1878 | * | ||
1879 | * The gcwq worker thread function. There's a single dynamic pool of | ||
1880 | * these per each cpu. These workers process all works regardless of | ||
1881 | * their specific target workqueue. The only exception is works which | ||
1882 | * belong to workqueues with a rescuer which will be explained in | ||
1883 | * rescuer_thread(). | ||
1884 | */ | ||
1885 | static int worker_thread(void *__worker) | ||
1886 | { | ||
1887 | struct worker *worker = __worker; | ||
1888 | struct global_cwq *gcwq = worker->gcwq; | ||
1889 | |||
1890 | /* tell the scheduler that this is a workqueue worker */ | ||
1891 | worker->task->flags |= PF_WQ_WORKER; | ||
1892 | woke_up: | ||
1893 | spin_lock_irq(&gcwq->lock); | ||
1894 | |||
1895 | /* DIE can be set only while we're idle, checking here is enough */ | ||
1896 | if (worker->flags & WORKER_DIE) { | ||
1897 | spin_unlock_irq(&gcwq->lock); | ||
1898 | worker->task->flags &= ~PF_WQ_WORKER; | ||
1899 | return 0; | ||
1900 | } | ||
1901 | |||
1902 | worker_leave_idle(worker); | ||
1903 | recheck: | ||
1904 | /* no more worker necessary? */ | ||
1905 | if (!need_more_worker(gcwq)) | ||
1906 | goto sleep; | ||
1907 | |||
1908 | /* do we need to manage? */ | ||
1909 | if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) | ||
1910 | goto recheck; | ||
1911 | |||
1912 | /* | ||
1913 | * ->scheduled list can only be filled while a worker is | ||
1914 | * preparing to process a work or actually processing it. | ||
1915 | * Make sure nobody diddled with it while I was sleeping. | ||
1916 | */ | ||
1917 | BUG_ON(!list_empty(&worker->scheduled)); | ||
1918 | |||
1919 | /* | ||
1920 | * When control reaches this point, we're guaranteed to have | ||
1921 | * at least one idle worker or that someone else has already | ||
1922 | * assumed the manager role. | ||
1923 | */ | ||
1924 | worker_clr_flags(worker, WORKER_PREP); | ||
1925 | |||
1926 | do { | ||
1927 | struct work_struct *work = | ||
1928 | list_first_entry(&gcwq->worklist, | ||
1929 | struct work_struct, entry); | ||
1930 | |||
1931 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { | ||
1932 | /* optimization path, not strictly necessary */ | ||
1933 | process_one_work(worker, work); | ||
1934 | if (unlikely(!list_empty(&worker->scheduled))) | ||
1935 | process_scheduled_works(worker); | ||
1936 | } else { | ||
1937 | move_linked_works(work, &worker->scheduled, NULL); | ||
1938 | process_scheduled_works(worker); | ||
1939 | } | ||
1940 | } while (keep_working(gcwq)); | ||
1941 | |||
1942 | worker_set_flags(worker, WORKER_PREP, false); | ||
1943 | sleep: | ||
1944 | if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) | ||
1945 | goto recheck; | ||
1946 | |||
1947 | /* | ||
1948 | * gcwq->lock is held and there's no work to process and no | ||
1949 | * need to manage, sleep. Workers are woken up only while | ||
1950 | * holding gcwq->lock or from local cpu, so setting the | ||
1951 | * current state before releasing gcwq->lock is enough to | ||
1952 | * prevent losing any event. | ||
1953 | */ | ||
1954 | worker_enter_idle(worker); | ||
1955 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1956 | spin_unlock_irq(&gcwq->lock); | ||
1957 | schedule(); | ||
1958 | goto woke_up; | ||
1959 | } | ||
1960 | |||
1961 | /** | ||
1962 | * rescuer_thread - the rescuer thread function | ||
1963 | * @__wq: the associated workqueue | ||
1964 | * | ||
1965 | * Workqueue rescuer thread function. There's one rescuer for each | ||
1966 | * workqueue which has WQ_RESCUER set. | ||
1967 | * | ||
1968 | * Regular work processing on a gcwq may block trying to create a new | ||
1969 | * worker which uses GFP_KERNEL allocation which has a slight chance of | ||
1970 | * developing into deadlock if some works currently on the same queue | ||
1971 | * need to be processed to satisfy the GFP_KERNEL allocation. This is | ||
1972 | * the problem rescuer solves. | ||
1973 | * | ||
1974 | * When such condition is possible, the gcwq summons rescuers of all | ||
1975 | * workqueues which have works queued on the gcwq and lets them process | ||
1976 | * those works so that forward progress can be guaranteed. | ||
1977 | * | ||
1978 | * This should happen rarely. | ||
1979 | */ | ||
1980 | static int rescuer_thread(void *__wq) | ||
1981 | { | ||
1982 | struct workqueue_struct *wq = __wq; | ||
1983 | struct worker *rescuer = wq->rescuer; | ||
1984 | struct list_head *scheduled = &rescuer->scheduled; | ||
1985 | bool is_unbound = wq->flags & WQ_UNBOUND; | ||
1986 | unsigned int cpu; | ||
1987 | |||
1988 | set_user_nice(current, RESCUER_NICE_LEVEL); | ||
1989 | repeat: | ||
1990 | set_current_state(TASK_INTERRUPTIBLE); | ||
1991 | |||
1992 | if (kthread_should_stop()) | ||
1993 | return 0; | ||
1994 | |||
1995 | /* | ||
1996 | * See whether any cpu is asking for help. Unbound | ||
1997 | * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND. | ||
1998 | */ | ||
1999 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | ||
2000 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | ||
2001 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); | ||
2002 | struct global_cwq *gcwq = cwq->gcwq; | ||
2003 | struct work_struct *work, *n; | ||
2004 | |||
2005 | __set_current_state(TASK_RUNNING); | ||
2006 | mayday_clear_cpu(cpu, wq->mayday_mask); | ||
2007 | |||
2008 | /* migrate to the target cpu if possible */ | ||
2009 | rescuer->gcwq = gcwq; | ||
2010 | worker_maybe_bind_and_lock(rescuer); | ||
2011 | |||
2012 | /* | ||
2013 | * Slurp in all works issued via this workqueue and | ||
2014 | * process'em. | ||
2015 | */ | ||
2016 | BUG_ON(!list_empty(&rescuer->scheduled)); | ||
2017 | list_for_each_entry_safe(work, n, &gcwq->worklist, entry) | ||
2018 | if (get_work_cwq(work) == cwq) | ||
2019 | move_linked_works(work, scheduled, &n); | ||
2020 | |||
2021 | process_scheduled_works(rescuer); | ||
2022 | spin_unlock_irq(&gcwq->lock); | ||
2023 | } | ||
2024 | |||
2025 | schedule(); | ||
2026 | goto repeat; | ||
459 | } | 2027 | } |
460 | 2028 | ||
461 | struct wq_barrier { | 2029 | struct wq_barrier { |
@@ -469,44 +2037,137 @@ static void wq_barrier_func(struct work_struct *work) | |||
469 | complete(&barr->done); | 2037 | complete(&barr->done); |
470 | } | 2038 | } |
471 | 2039 | ||
2040 | /** | ||
2041 | * insert_wq_barrier - insert a barrier work | ||
2042 | * @cwq: cwq to insert barrier into | ||
2043 | * @barr: wq_barrier to insert | ||
2044 | * @target: target work to attach @barr to | ||
2045 | * @worker: worker currently executing @target, NULL if @target is not executing | ||
2046 | * | ||
2047 | * @barr is linked to @target such that @barr is completed only after | ||
2048 | * @target finishes execution. Please note that the ordering | ||
2049 | * guarantee is observed only with respect to @target and on the local | ||
2050 | * cpu. | ||
2051 | * | ||
2052 | * Currently, a queued barrier can't be canceled. This is because | ||
2053 | * try_to_grab_pending() can't determine whether the work to be | ||
2054 | * grabbed is at the head of the queue and thus can't clear LINKED | ||
2055 | * flag of the previous work while there must be a valid next work | ||
2056 | * after a work with LINKED flag set. | ||
2057 | * | ||
2058 | * Note that when @worker is non-NULL, @target may be modified | ||
2059 | * underneath us, so we can't reliably determine cwq from @target. | ||
2060 | * | ||
2061 | * CONTEXT: | ||
2062 | * spin_lock_irq(gcwq->lock). | ||
2063 | */ | ||
472 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | 2064 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, |
473 | struct wq_barrier *barr, struct list_head *head) | 2065 | struct wq_barrier *barr, |
2066 | struct work_struct *target, struct worker *worker) | ||
474 | { | 2067 | { |
2068 | struct list_head *head; | ||
2069 | unsigned int linked = 0; | ||
2070 | |||
475 | /* | 2071 | /* |
476 | * debugobject calls are safe here even with cwq->lock locked | 2072 | * debugobject calls are safe here even with gcwq->lock locked |
477 | * as we know for sure that this will not trigger any of the | 2073 | * as we know for sure that this will not trigger any of the |
478 | * checks and call back into the fixup functions where we | 2074 | * checks and call back into the fixup functions where we |
479 | * might deadlock. | 2075 | * might deadlock. |
480 | */ | 2076 | */ |
481 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); | 2077 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); |
482 | __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); | 2078 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
483 | |||
484 | init_completion(&barr->done); | 2079 | init_completion(&barr->done); |
485 | 2080 | ||
2081 | /* | ||
2082 | * If @target is currently being executed, schedule the | ||
2083 | * barrier to the worker; otherwise, put it after @target. | ||
2084 | */ | ||
2085 | if (worker) | ||
2086 | head = worker->scheduled.next; | ||
2087 | else { | ||
2088 | unsigned long *bits = work_data_bits(target); | ||
2089 | |||
2090 | head = target->entry.next; | ||
2091 | /* there can already be other linked works, inherit and set */ | ||
2092 | linked = *bits & WORK_STRUCT_LINKED; | ||
2093 | __set_bit(WORK_STRUCT_LINKED_BIT, bits); | ||
2094 | } | ||
2095 | |||
486 | debug_work_activate(&barr->work); | 2096 | debug_work_activate(&barr->work); |
487 | insert_work(cwq, &barr->work, head); | 2097 | insert_work(cwq, &barr->work, head, |
2098 | work_color_to_flags(WORK_NO_COLOR) | linked); | ||
488 | } | 2099 | } |
489 | 2100 | ||
490 | static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | 2101 | /** |
2102 | * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing | ||
2103 | * @wq: workqueue being flushed | ||
2104 | * @flush_color: new flush color, < 0 for no-op | ||
2105 | * @work_color: new work color, < 0 for no-op | ||
2106 | * | ||
2107 | * Prepare cwqs for workqueue flushing. | ||
2108 | * | ||
2109 | * If @flush_color is non-negative, flush_color on all cwqs should be | ||
2110 | * -1. If no cwq has in-flight commands at the specified color, all | ||
2111 | * cwq->flush_color's stay at -1 and %false is returned. If any cwq | ||
2112 | * has in flight commands, its cwq->flush_color is set to | ||
2113 | * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq | ||
2114 | * wakeup logic is armed and %true is returned. | ||
2115 | * | ||
2116 | * The caller should have initialized @wq->first_flusher prior to | ||
2117 | * calling this function with non-negative @flush_color. If | ||
2118 | * @flush_color is negative, no flush color update is done and %false | ||
2119 | * is returned. | ||
2120 | * | ||
2121 | * If @work_color is non-negative, all cwqs should have the same | ||
2122 | * work_color which is previous to @work_color and all will be | ||
2123 | * advanced to @work_color. | ||
2124 | * | ||
2125 | * CONTEXT: | ||
2126 | * mutex_lock(wq->flush_mutex). | ||
2127 | * | ||
2128 | * RETURNS: | ||
2129 | * %true if @flush_color >= 0 and there's something to flush. %false | ||
2130 | * otherwise. | ||
2131 | */ | ||
2132 | static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, | ||
2133 | int flush_color, int work_color) | ||
491 | { | 2134 | { |
492 | int active = 0; | 2135 | bool wait = false; |
493 | struct wq_barrier barr; | 2136 | unsigned int cpu; |
494 | |||
495 | WARN_ON(cwq->thread == current); | ||
496 | 2137 | ||
497 | spin_lock_irq(&cwq->lock); | 2138 | if (flush_color >= 0) { |
498 | if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { | 2139 | BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); |
499 | insert_wq_barrier(cwq, &barr, &cwq->worklist); | 2140 | atomic_set(&wq->nr_cwqs_to_flush, 1); |
500 | active = 1; | ||
501 | } | 2141 | } |
502 | spin_unlock_irq(&cwq->lock); | ||
503 | 2142 | ||
504 | if (active) { | 2143 | for_each_cwq_cpu(cpu, wq) { |
505 | wait_for_completion(&barr.done); | 2144 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
506 | destroy_work_on_stack(&barr.work); | 2145 | struct global_cwq *gcwq = cwq->gcwq; |
2146 | |||
2147 | spin_lock_irq(&gcwq->lock); | ||
2148 | |||
2149 | if (flush_color >= 0) { | ||
2150 | BUG_ON(cwq->flush_color != -1); | ||
2151 | |||
2152 | if (cwq->nr_in_flight[flush_color]) { | ||
2153 | cwq->flush_color = flush_color; | ||
2154 | atomic_inc(&wq->nr_cwqs_to_flush); | ||
2155 | wait = true; | ||
2156 | } | ||
2157 | } | ||
2158 | |||
2159 | if (work_color >= 0) { | ||
2160 | BUG_ON(work_color != work_next_color(cwq->work_color)); | ||
2161 | cwq->work_color = work_color; | ||
2162 | } | ||
2163 | |||
2164 | spin_unlock_irq(&gcwq->lock); | ||
507 | } | 2165 | } |
508 | 2166 | ||
509 | return active; | 2167 | if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) |
2168 | complete(&wq->first_flusher->done); | ||
2169 | |||
2170 | return wait; | ||
510 | } | 2171 | } |
511 | 2172 | ||
512 | /** | 2173 | /** |
@@ -518,20 +2179,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | |||
518 | * | 2179 | * |
519 | * We sleep until all works which were queued on entry have been handled, | 2180 | * We sleep until all works which were queued on entry have been handled, |
520 | * but we are not livelocked by new incoming ones. | 2181 | * but we are not livelocked by new incoming ones. |
521 | * | ||
522 | * This function used to run the workqueues itself. Now we just wait for the | ||
523 | * helper threads to do it. | ||
524 | */ | 2182 | */ |
525 | void flush_workqueue(struct workqueue_struct *wq) | 2183 | void flush_workqueue(struct workqueue_struct *wq) |
526 | { | 2184 | { |
527 | const struct cpumask *cpu_map = wq_cpu_map(wq); | 2185 | struct wq_flusher this_flusher = { |
528 | int cpu; | 2186 | .list = LIST_HEAD_INIT(this_flusher.list), |
2187 | .flush_color = -1, | ||
2188 | .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), | ||
2189 | }; | ||
2190 | int next_color; | ||
529 | 2191 | ||
530 | might_sleep(); | ||
531 | lock_map_acquire(&wq->lockdep_map); | 2192 | lock_map_acquire(&wq->lockdep_map); |
532 | lock_map_release(&wq->lockdep_map); | 2193 | lock_map_release(&wq->lockdep_map); |
533 | for_each_cpu(cpu, cpu_map) | 2194 | |
534 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); | 2195 | mutex_lock(&wq->flush_mutex); |
2196 | |||
2197 | /* | ||
2198 | * Start-to-wait phase | ||
2199 | */ | ||
2200 | next_color = work_next_color(wq->work_color); | ||
2201 | |||
2202 | if (next_color != wq->flush_color) { | ||
2203 | /* | ||
2204 | * Color space is not full. The current work_color | ||
2205 | * becomes our flush_color and work_color is advanced | ||
2206 | * by one. | ||
2207 | */ | ||
2208 | BUG_ON(!list_empty(&wq->flusher_overflow)); | ||
2209 | this_flusher.flush_color = wq->work_color; | ||
2210 | wq->work_color = next_color; | ||
2211 | |||
2212 | if (!wq->first_flusher) { | ||
2213 | /* no flush in progress, become the first flusher */ | ||
2214 | BUG_ON(wq->flush_color != this_flusher.flush_color); | ||
2215 | |||
2216 | wq->first_flusher = &this_flusher; | ||
2217 | |||
2218 | if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, | ||
2219 | wq->work_color)) { | ||
2220 | /* nothing to flush, done */ | ||
2221 | wq->flush_color = next_color; | ||
2222 | wq->first_flusher = NULL; | ||
2223 | goto out_unlock; | ||
2224 | } | ||
2225 | } else { | ||
2226 | /* wait in queue */ | ||
2227 | BUG_ON(wq->flush_color == this_flusher.flush_color); | ||
2228 | list_add_tail(&this_flusher.list, &wq->flusher_queue); | ||
2229 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | ||
2230 | } | ||
2231 | } else { | ||
2232 | /* | ||
2233 | * Oops, color space is full, wait on overflow queue. | ||
2234 | * The next flush completion will assign us | ||
2235 | * flush_color and transfer to flusher_queue. | ||
2236 | */ | ||
2237 | list_add_tail(&this_flusher.list, &wq->flusher_overflow); | ||
2238 | } | ||
2239 | |||
2240 | mutex_unlock(&wq->flush_mutex); | ||
2241 | |||
2242 | wait_for_completion(&this_flusher.done); | ||
2243 | |||
2244 | /* | ||
2245 | * Wake-up-and-cascade phase | ||
2246 | * | ||
2247 | * First flushers are responsible for cascading flushes and | ||
2248 | * handling overflow. Non-first flushers can simply return. | ||
2249 | */ | ||
2250 | if (wq->first_flusher != &this_flusher) | ||
2251 | return; | ||
2252 | |||
2253 | mutex_lock(&wq->flush_mutex); | ||
2254 | |||
2255 | /* we might have raced, check again with mutex held */ | ||
2256 | if (wq->first_flusher != &this_flusher) | ||
2257 | goto out_unlock; | ||
2258 | |||
2259 | wq->first_flusher = NULL; | ||
2260 | |||
2261 | BUG_ON(!list_empty(&this_flusher.list)); | ||
2262 | BUG_ON(wq->flush_color != this_flusher.flush_color); | ||
2263 | |||
2264 | while (true) { | ||
2265 | struct wq_flusher *next, *tmp; | ||
2266 | |||
2267 | /* complete all the flushers sharing the current flush color */ | ||
2268 | list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { | ||
2269 | if (next->flush_color != wq->flush_color) | ||
2270 | break; | ||
2271 | list_del_init(&next->list); | ||
2272 | complete(&next->done); | ||
2273 | } | ||
2274 | |||
2275 | BUG_ON(!list_empty(&wq->flusher_overflow) && | ||
2276 | wq->flush_color != work_next_color(wq->work_color)); | ||
2277 | |||
2278 | /* this flush_color is finished, advance by one */ | ||
2279 | wq->flush_color = work_next_color(wq->flush_color); | ||
2280 | |||
2281 | /* one color has been freed, handle overflow queue */ | ||
2282 | if (!list_empty(&wq->flusher_overflow)) { | ||
2283 | /* | ||
2284 | * Assign the same color to all overflowed | ||
2285 | * flushers, advance work_color and append to | ||
2286 | * flusher_queue. This is the start-to-wait | ||
2287 | * phase for these overflowed flushers. | ||
2288 | */ | ||
2289 | list_for_each_entry(tmp, &wq->flusher_overflow, list) | ||
2290 | tmp->flush_color = wq->work_color; | ||
2291 | |||
2292 | wq->work_color = work_next_color(wq->work_color); | ||
2293 | |||
2294 | list_splice_tail_init(&wq->flusher_overflow, | ||
2295 | &wq->flusher_queue); | ||
2296 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | ||
2297 | } | ||
2298 | |||
2299 | if (list_empty(&wq->flusher_queue)) { | ||
2300 | BUG_ON(wq->flush_color != wq->work_color); | ||
2301 | break; | ||
2302 | } | ||
2303 | |||
2304 | /* | ||
2305 | * Need to flush more colors. Make the next flusher | ||
2306 | * the new first flusher and arm cwqs. | ||
2307 | */ | ||
2308 | BUG_ON(wq->flush_color == wq->work_color); | ||
2309 | BUG_ON(wq->flush_color != next->flush_color); | ||
2310 | |||
2311 | list_del_init(&next->list); | ||
2312 | wq->first_flusher = next; | ||
2313 | |||
2314 | if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) | ||
2315 | break; | ||
2316 | |||
2317 | /* | ||
2318 | * Meh... this color is already done, clear first | ||
2319 | * flusher and repeat cascading. | ||
2320 | */ | ||
2321 | wq->first_flusher = NULL; | ||
2322 | } | ||
2323 | |||
2324 | out_unlock: | ||
2325 | mutex_unlock(&wq->flush_mutex); | ||
535 | } | 2326 | } |
536 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2327 | EXPORT_SYMBOL_GPL(flush_workqueue); |
537 | 2328 | ||
@@ -547,43 +2338,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); | |||
547 | */ | 2338 | */ |
548 | int flush_work(struct work_struct *work) | 2339 | int flush_work(struct work_struct *work) |
549 | { | 2340 | { |
2341 | struct worker *worker = NULL; | ||
2342 | struct global_cwq *gcwq; | ||
550 | struct cpu_workqueue_struct *cwq; | 2343 | struct cpu_workqueue_struct *cwq; |
551 | struct list_head *prev; | ||
552 | struct wq_barrier barr; | 2344 | struct wq_barrier barr; |
553 | 2345 | ||
554 | might_sleep(); | 2346 | might_sleep(); |
555 | cwq = get_wq_data(work); | 2347 | gcwq = get_work_gcwq(work); |
556 | if (!cwq) | 2348 | if (!gcwq) |
557 | return 0; | 2349 | return 0; |
558 | 2350 | ||
559 | lock_map_acquire(&cwq->wq->lockdep_map); | 2351 | spin_lock_irq(&gcwq->lock); |
560 | lock_map_release(&cwq->wq->lockdep_map); | ||
561 | |||
562 | prev = NULL; | ||
563 | spin_lock_irq(&cwq->lock); | ||
564 | if (!list_empty(&work->entry)) { | 2352 | if (!list_empty(&work->entry)) { |
565 | /* | 2353 | /* |
566 | * See the comment near try_to_grab_pending()->smp_rmb(). | 2354 | * See the comment near try_to_grab_pending()->smp_rmb(). |
567 | * If it was re-queued under us we are not going to wait. | 2355 | * If it was re-queued to a different gcwq under us, we |
2356 | * are not going to wait. | ||
568 | */ | 2357 | */ |
569 | smp_rmb(); | 2358 | smp_rmb(); |
570 | if (unlikely(cwq != get_wq_data(work))) | 2359 | cwq = get_work_cwq(work); |
571 | goto out; | 2360 | if (unlikely(!cwq || gcwq != cwq->gcwq)) |
572 | prev = &work->entry; | 2361 | goto already_gone; |
573 | } else { | 2362 | } else { |
574 | if (cwq->current_work != work) | 2363 | worker = find_worker_executing_work(gcwq, work); |
575 | goto out; | 2364 | if (!worker) |
576 | prev = &cwq->worklist; | 2365 | goto already_gone; |
2366 | cwq = worker->current_cwq; | ||
577 | } | 2367 | } |
578 | insert_wq_barrier(cwq, &barr, prev->next); | 2368 | |
579 | out: | 2369 | insert_wq_barrier(cwq, &barr, work, worker); |
580 | spin_unlock_irq(&cwq->lock); | 2370 | spin_unlock_irq(&gcwq->lock); |
581 | if (!prev) | 2371 | |
582 | return 0; | 2372 | lock_map_acquire(&cwq->wq->lockdep_map); |
2373 | lock_map_release(&cwq->wq->lockdep_map); | ||
583 | 2374 | ||
584 | wait_for_completion(&barr.done); | 2375 | wait_for_completion(&barr.done); |
585 | destroy_work_on_stack(&barr.work); | 2376 | destroy_work_on_stack(&barr.work); |
586 | return 1; | 2377 | return 1; |
2378 | already_gone: | ||
2379 | spin_unlock_irq(&gcwq->lock); | ||
2380 | return 0; | ||
587 | } | 2381 | } |
588 | EXPORT_SYMBOL_GPL(flush_work); | 2382 | EXPORT_SYMBOL_GPL(flush_work); |
589 | 2383 | ||
@@ -593,54 +2387,56 @@ EXPORT_SYMBOL_GPL(flush_work); | |||
593 | */ | 2387 | */ |
594 | static int try_to_grab_pending(struct work_struct *work) | 2388 | static int try_to_grab_pending(struct work_struct *work) |
595 | { | 2389 | { |
596 | struct cpu_workqueue_struct *cwq; | 2390 | struct global_cwq *gcwq; |
597 | int ret = -1; | 2391 | int ret = -1; |
598 | 2392 | ||
599 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) | 2393 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) |
600 | return 0; | 2394 | return 0; |
601 | 2395 | ||
602 | /* | 2396 | /* |
603 | * The queueing is in progress, or it is already queued. Try to | 2397 | * The queueing is in progress, or it is already queued. Try to |
604 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | 2398 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. |
605 | */ | 2399 | */ |
606 | 2400 | gcwq = get_work_gcwq(work); | |
607 | cwq = get_wq_data(work); | 2401 | if (!gcwq) |
608 | if (!cwq) | ||
609 | return ret; | 2402 | return ret; |
610 | 2403 | ||
611 | spin_lock_irq(&cwq->lock); | 2404 | spin_lock_irq(&gcwq->lock); |
612 | if (!list_empty(&work->entry)) { | 2405 | if (!list_empty(&work->entry)) { |
613 | /* | 2406 | /* |
614 | * This work is queued, but perhaps we locked the wrong cwq. | 2407 | * This work is queued, but perhaps we locked the wrong gcwq. |
615 | * In that case we must see the new value after rmb(), see | 2408 | * In that case we must see the new value after rmb(), see |
616 | * insert_work()->wmb(). | 2409 | * insert_work()->wmb(). |
617 | */ | 2410 | */ |
618 | smp_rmb(); | 2411 | smp_rmb(); |
619 | if (cwq == get_wq_data(work)) { | 2412 | if (gcwq == get_work_gcwq(work)) { |
620 | debug_work_deactivate(work); | 2413 | debug_work_deactivate(work); |
621 | list_del_init(&work->entry); | 2414 | list_del_init(&work->entry); |
2415 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
2416 | get_work_color(work), | ||
2417 | *work_data_bits(work) & WORK_STRUCT_DELAYED); | ||
622 | ret = 1; | 2418 | ret = 1; |
623 | } | 2419 | } |
624 | } | 2420 | } |
625 | spin_unlock_irq(&cwq->lock); | 2421 | spin_unlock_irq(&gcwq->lock); |
626 | 2422 | ||
627 | return ret; | 2423 | return ret; |
628 | } | 2424 | } |
629 | 2425 | ||
630 | static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, | 2426 | static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) |
631 | struct work_struct *work) | ||
632 | { | 2427 | { |
633 | struct wq_barrier barr; | 2428 | struct wq_barrier barr; |
634 | int running = 0; | 2429 | struct worker *worker; |
635 | 2430 | ||
636 | spin_lock_irq(&cwq->lock); | 2431 | spin_lock_irq(&gcwq->lock); |
637 | if (unlikely(cwq->current_work == work)) { | ||
638 | insert_wq_barrier(cwq, &barr, cwq->worklist.next); | ||
639 | running = 1; | ||
640 | } | ||
641 | spin_unlock_irq(&cwq->lock); | ||
642 | 2432 | ||
643 | if (unlikely(running)) { | 2433 | worker = find_worker_executing_work(gcwq, work); |
2434 | if (unlikely(worker)) | ||
2435 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
2436 | |||
2437 | spin_unlock_irq(&gcwq->lock); | ||
2438 | |||
2439 | if (unlikely(worker)) { | ||
644 | wait_for_completion(&barr.done); | 2440 | wait_for_completion(&barr.done); |
645 | destroy_work_on_stack(&barr.work); | 2441 | destroy_work_on_stack(&barr.work); |
646 | } | 2442 | } |
@@ -648,9 +2444,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, | |||
648 | 2444 | ||
649 | static void wait_on_work(struct work_struct *work) | 2445 | static void wait_on_work(struct work_struct *work) |
650 | { | 2446 | { |
651 | struct cpu_workqueue_struct *cwq; | ||
652 | struct workqueue_struct *wq; | ||
653 | const struct cpumask *cpu_map; | ||
654 | int cpu; | 2447 | int cpu; |
655 | 2448 | ||
656 | might_sleep(); | 2449 | might_sleep(); |
@@ -658,15 +2451,8 @@ static void wait_on_work(struct work_struct *work) | |||
658 | lock_map_acquire(&work->lockdep_map); | 2451 | lock_map_acquire(&work->lockdep_map); |
659 | lock_map_release(&work->lockdep_map); | 2452 | lock_map_release(&work->lockdep_map); |
660 | 2453 | ||
661 | cwq = get_wq_data(work); | 2454 | for_each_gcwq_cpu(cpu) |
662 | if (!cwq) | 2455 | wait_on_cpu_work(get_gcwq(cpu), work); |
663 | return; | ||
664 | |||
665 | wq = cwq->wq; | ||
666 | cpu_map = wq_cpu_map(wq); | ||
667 | |||
668 | for_each_cpu(cpu, cpu_map) | ||
669 | wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | ||
670 | } | 2456 | } |
671 | 2457 | ||
672 | static int __cancel_work_timer(struct work_struct *work, | 2458 | static int __cancel_work_timer(struct work_struct *work, |
@@ -681,7 +2467,7 @@ static int __cancel_work_timer(struct work_struct *work, | |||
681 | wait_on_work(work); | 2467 | wait_on_work(work); |
682 | } while (unlikely(ret < 0)); | 2468 | } while (unlikely(ret < 0)); |
683 | 2469 | ||
684 | clear_wq_data(work); | 2470 | clear_work_data(work); |
685 | return ret; | 2471 | return ret; |
686 | } | 2472 | } |
687 | 2473 | ||
@@ -727,8 +2513,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) | |||
727 | } | 2513 | } |
728 | EXPORT_SYMBOL(cancel_delayed_work_sync); | 2514 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
729 | 2515 | ||
730 | static struct workqueue_struct *keventd_wq __read_mostly; | ||
731 | |||
732 | /** | 2516 | /** |
733 | * schedule_work - put work task in global workqueue | 2517 | * schedule_work - put work task in global workqueue |
734 | * @work: job to be done | 2518 | * @work: job to be done |
@@ -742,7 +2526,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; | |||
742 | */ | 2526 | */ |
743 | int schedule_work(struct work_struct *work) | 2527 | int schedule_work(struct work_struct *work) |
744 | { | 2528 | { |
745 | return queue_work(keventd_wq, work); | 2529 | return queue_work(system_wq, work); |
746 | } | 2530 | } |
747 | EXPORT_SYMBOL(schedule_work); | 2531 | EXPORT_SYMBOL(schedule_work); |
748 | 2532 | ||
@@ -755,7 +2539,7 @@ EXPORT_SYMBOL(schedule_work); | |||
755 | */ | 2539 | */ |
756 | int schedule_work_on(int cpu, struct work_struct *work) | 2540 | int schedule_work_on(int cpu, struct work_struct *work) |
757 | { | 2541 | { |
758 | return queue_work_on(cpu, keventd_wq, work); | 2542 | return queue_work_on(cpu, system_wq, work); |
759 | } | 2543 | } |
760 | EXPORT_SYMBOL(schedule_work_on); | 2544 | EXPORT_SYMBOL(schedule_work_on); |
761 | 2545 | ||
@@ -770,7 +2554,7 @@ EXPORT_SYMBOL(schedule_work_on); | |||
770 | int schedule_delayed_work(struct delayed_work *dwork, | 2554 | int schedule_delayed_work(struct delayed_work *dwork, |
771 | unsigned long delay) | 2555 | unsigned long delay) |
772 | { | 2556 | { |
773 | return queue_delayed_work(keventd_wq, dwork, delay); | 2557 | return queue_delayed_work(system_wq, dwork, delay); |
774 | } | 2558 | } |
775 | EXPORT_SYMBOL(schedule_delayed_work); | 2559 | EXPORT_SYMBOL(schedule_delayed_work); |
776 | 2560 | ||
@@ -783,9 +2567,8 @@ EXPORT_SYMBOL(schedule_delayed_work); | |||
783 | void flush_delayed_work(struct delayed_work *dwork) | 2567 | void flush_delayed_work(struct delayed_work *dwork) |
784 | { | 2568 | { |
785 | if (del_timer_sync(&dwork->timer)) { | 2569 | if (del_timer_sync(&dwork->timer)) { |
786 | struct cpu_workqueue_struct *cwq; | 2570 | __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, |
787 | cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); | 2571 | &dwork->work); |
788 | __queue_work(cwq, &dwork->work); | ||
789 | put_cpu(); | 2572 | put_cpu(); |
790 | } | 2573 | } |
791 | flush_work(&dwork->work); | 2574 | flush_work(&dwork->work); |
@@ -804,7 +2587,7 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
804 | int schedule_delayed_work_on(int cpu, | 2587 | int schedule_delayed_work_on(int cpu, |
805 | struct delayed_work *dwork, unsigned long delay) | 2588 | struct delayed_work *dwork, unsigned long delay) |
806 | { | 2589 | { |
807 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); | 2590 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); |
808 | } | 2591 | } |
809 | EXPORT_SYMBOL(schedule_delayed_work_on); | 2592 | EXPORT_SYMBOL(schedule_delayed_work_on); |
810 | 2593 | ||
@@ -820,8 +2603,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
820 | int schedule_on_each_cpu(work_func_t func) | 2603 | int schedule_on_each_cpu(work_func_t func) |
821 | { | 2604 | { |
822 | int cpu; | 2605 | int cpu; |
823 | int orig = -1; | 2606 | struct work_struct __percpu *works; |
824 | struct work_struct *works; | ||
825 | 2607 | ||
826 | works = alloc_percpu(struct work_struct); | 2608 | works = alloc_percpu(struct work_struct); |
827 | if (!works) | 2609 | if (!works) |
@@ -829,23 +2611,12 @@ int schedule_on_each_cpu(work_func_t func) | |||
829 | 2611 | ||
830 | get_online_cpus(); | 2612 | get_online_cpus(); |
831 | 2613 | ||
832 | /* | ||
833 | * When running in keventd don't schedule a work item on | ||
834 | * itself. Can just call directly because the work queue is | ||
835 | * already bound. This also is faster. | ||
836 | */ | ||
837 | if (current_is_keventd()) | ||
838 | orig = raw_smp_processor_id(); | ||
839 | |||
840 | for_each_online_cpu(cpu) { | 2614 | for_each_online_cpu(cpu) { |
841 | struct work_struct *work = per_cpu_ptr(works, cpu); | 2615 | struct work_struct *work = per_cpu_ptr(works, cpu); |
842 | 2616 | ||
843 | INIT_WORK(work, func); | 2617 | INIT_WORK(work, func); |
844 | if (cpu != orig) | 2618 | schedule_work_on(cpu, work); |
845 | schedule_work_on(cpu, work); | ||
846 | } | 2619 | } |
847 | if (orig >= 0) | ||
848 | func(per_cpu_ptr(works, orig)); | ||
849 | 2620 | ||
850 | for_each_online_cpu(cpu) | 2621 | for_each_online_cpu(cpu) |
851 | flush_work(per_cpu_ptr(works, cpu)); | 2622 | flush_work(per_cpu_ptr(works, cpu)); |
@@ -881,7 +2652,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
881 | */ | 2652 | */ |
882 | void flush_scheduled_work(void) | 2653 | void flush_scheduled_work(void) |
883 | { | 2654 | { |
884 | flush_workqueue(keventd_wq); | 2655 | flush_workqueue(system_wq); |
885 | } | 2656 | } |
886 | EXPORT_SYMBOL(flush_scheduled_work); | 2657 | EXPORT_SYMBOL(flush_scheduled_work); |
887 | 2658 | ||
@@ -913,170 +2684,169 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); | |||
913 | 2684 | ||
914 | int keventd_up(void) | 2685 | int keventd_up(void) |
915 | { | 2686 | { |
916 | return keventd_wq != NULL; | 2687 | return system_wq != NULL; |
917 | } | 2688 | } |
918 | 2689 | ||
919 | int current_is_keventd(void) | 2690 | static int alloc_cwqs(struct workqueue_struct *wq) |
920 | { | 2691 | { |
921 | struct cpu_workqueue_struct *cwq; | 2692 | /* |
922 | int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ | 2693 | * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. |
923 | int ret = 0; | 2694 | * Make sure that the alignment isn't lower than that of |
924 | 2695 | * unsigned long long. | |
925 | BUG_ON(!keventd_wq); | 2696 | */ |
2697 | const size_t size = sizeof(struct cpu_workqueue_struct); | ||
2698 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, | ||
2699 | __alignof__(unsigned long long)); | ||
2700 | #ifdef CONFIG_SMP | ||
2701 | bool percpu = !(wq->flags & WQ_UNBOUND); | ||
2702 | #else | ||
2703 | bool percpu = false; | ||
2704 | #endif | ||
926 | 2705 | ||
927 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); | 2706 | if (percpu) |
928 | if (current == cwq->thread) | 2707 | wq->cpu_wq.pcpu = __alloc_percpu(size, align); |
929 | ret = 1; | 2708 | else { |
2709 | void *ptr; | ||
930 | 2710 | ||
931 | return ret; | 2711 | /* |
2712 | * Allocate enough room to align cwq and put an extra | ||
2713 | * pointer at the end pointing back to the originally | ||
2714 | * allocated pointer which will be used for free. | ||
2715 | */ | ||
2716 | ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); | ||
2717 | if (ptr) { | ||
2718 | wq->cpu_wq.single = PTR_ALIGN(ptr, align); | ||
2719 | *(void **)(wq->cpu_wq.single + 1) = ptr; | ||
2720 | } | ||
2721 | } | ||
932 | 2722 | ||
2723 | /* just in case, make sure it's actually aligned */ | ||
2724 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | ||
2725 | return wq->cpu_wq.v ? 0 : -ENOMEM; | ||
933 | } | 2726 | } |
934 | 2727 | ||
935 | static struct cpu_workqueue_struct * | 2728 | static void free_cwqs(struct workqueue_struct *wq) |
936 | init_cpu_workqueue(struct workqueue_struct *wq, int cpu) | ||
937 | { | 2729 | { |
938 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 2730 | #ifdef CONFIG_SMP |
939 | 2731 | bool percpu = !(wq->flags & WQ_UNBOUND); | |
940 | cwq->wq = wq; | 2732 | #else |
941 | spin_lock_init(&cwq->lock); | 2733 | bool percpu = false; |
942 | INIT_LIST_HEAD(&cwq->worklist); | 2734 | #endif |
943 | init_waitqueue_head(&cwq->more_work); | ||
944 | 2735 | ||
945 | return cwq; | 2736 | if (percpu) |
2737 | free_percpu(wq->cpu_wq.pcpu); | ||
2738 | else if (wq->cpu_wq.single) { | ||
2739 | /* the pointer to free is stored right after the cwq */ | ||
2740 | kfree(*(void **)(wq->cpu_wq.single + 1)); | ||
2741 | } | ||
946 | } | 2742 | } |
947 | 2743 | ||
948 | static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 2744 | static int wq_clamp_max_active(int max_active, unsigned int flags, |
2745 | const char *name) | ||
949 | { | 2746 | { |
950 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 2747 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; |
951 | struct workqueue_struct *wq = cwq->wq; | ||
952 | const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; | ||
953 | struct task_struct *p; | ||
954 | |||
955 | p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); | ||
956 | /* | ||
957 | * Nobody can add the work_struct to this cwq, | ||
958 | * if (caller is __create_workqueue) | ||
959 | * nobody should see this wq | ||
960 | * else // caller is CPU_UP_PREPARE | ||
961 | * cpu is not on cpu_online_map | ||
962 | * so we can abort safely. | ||
963 | */ | ||
964 | if (IS_ERR(p)) | ||
965 | return PTR_ERR(p); | ||
966 | if (cwq->wq->rt) | ||
967 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
968 | cwq->thread = p; | ||
969 | 2748 | ||
970 | trace_workqueue_creation(cwq->thread, cpu); | 2749 | if (max_active < 1 || max_active > lim) |
2750 | printk(KERN_WARNING "workqueue: max_active %d requested for %s " | ||
2751 | "is out of range, clamping between %d and %d\n", | ||
2752 | max_active, name, 1, lim); | ||
971 | 2753 | ||
972 | return 0; | 2754 | return clamp_val(max_active, 1, lim); |
973 | } | 2755 | } |
974 | 2756 | ||
975 | static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 2757 | struct workqueue_struct *__alloc_workqueue_key(const char *name, |
2758 | unsigned int flags, | ||
2759 | int max_active, | ||
2760 | struct lock_class_key *key, | ||
2761 | const char *lock_name) | ||
976 | { | 2762 | { |
977 | struct task_struct *p = cwq->thread; | 2763 | struct workqueue_struct *wq; |
2764 | unsigned int cpu; | ||
978 | 2765 | ||
979 | if (p != NULL) { | 2766 | /* |
980 | if (cpu >= 0) | 2767 | * Unbound workqueues aren't concurrency managed and should be |
981 | kthread_bind(p, cpu); | 2768 | * dispatched to workers immediately. |
982 | wake_up_process(p); | 2769 | */ |
983 | } | 2770 | if (flags & WQ_UNBOUND) |
984 | } | 2771 | flags |= WQ_HIGHPRI; |
985 | 2772 | ||
986 | struct workqueue_struct *__create_workqueue_key(const char *name, | 2773 | max_active = max_active ?: WQ_DFL_ACTIVE; |
987 | int singlethread, | 2774 | max_active = wq_clamp_max_active(max_active, flags, name); |
988 | int freezeable, | ||
989 | int rt, | ||
990 | struct lock_class_key *key, | ||
991 | const char *lock_name) | ||
992 | { | ||
993 | struct workqueue_struct *wq; | ||
994 | struct cpu_workqueue_struct *cwq; | ||
995 | int err = 0, cpu; | ||
996 | 2775 | ||
997 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); | 2776 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); |
998 | if (!wq) | 2777 | if (!wq) |
999 | return NULL; | 2778 | goto err; |
1000 | 2779 | ||
1001 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | 2780 | wq->flags = flags; |
1002 | if (!wq->cpu_wq) { | 2781 | wq->saved_max_active = max_active; |
1003 | kfree(wq); | 2782 | mutex_init(&wq->flush_mutex); |
1004 | return NULL; | 2783 | atomic_set(&wq->nr_cwqs_to_flush, 0); |
1005 | } | 2784 | INIT_LIST_HEAD(&wq->flusher_queue); |
2785 | INIT_LIST_HEAD(&wq->flusher_overflow); | ||
1006 | 2786 | ||
1007 | wq->name = name; | 2787 | wq->name = name; |
1008 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 2788 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
1009 | wq->singlethread = singlethread; | ||
1010 | wq->freezeable = freezeable; | ||
1011 | wq->rt = rt; | ||
1012 | INIT_LIST_HEAD(&wq->list); | 2789 | INIT_LIST_HEAD(&wq->list); |
1013 | 2790 | ||
1014 | if (singlethread) { | 2791 | if (alloc_cwqs(wq) < 0) |
1015 | cwq = init_cpu_workqueue(wq, singlethread_cpu); | 2792 | goto err; |
1016 | err = create_workqueue_thread(cwq, singlethread_cpu); | 2793 | |
1017 | start_workqueue_thread(cwq, -1); | 2794 | for_each_cwq_cpu(cpu, wq) { |
1018 | } else { | 2795 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
1019 | cpu_maps_update_begin(); | 2796 | struct global_cwq *gcwq = get_gcwq(cpu); |
1020 | /* | 2797 | |
1021 | * We must place this wq on list even if the code below fails. | 2798 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); |
1022 | * cpu_down(cpu) can remove cpu from cpu_populated_map before | 2799 | cwq->gcwq = gcwq; |
1023 | * destroy_workqueue() takes the lock, in that case we leak | 2800 | cwq->wq = wq; |
1024 | * cwq[cpu]->thread. | 2801 | cwq->flush_color = -1; |
1025 | */ | 2802 | cwq->max_active = max_active; |
1026 | spin_lock(&workqueue_lock); | 2803 | INIT_LIST_HEAD(&cwq->delayed_works); |
1027 | list_add(&wq->list, &workqueues); | ||
1028 | spin_unlock(&workqueue_lock); | ||
1029 | /* | ||
1030 | * We must initialize cwqs for each possible cpu even if we | ||
1031 | * are going to call destroy_workqueue() finally. Otherwise | ||
1032 | * cpu_up() can hit the uninitialized cwq once we drop the | ||
1033 | * lock. | ||
1034 | */ | ||
1035 | for_each_possible_cpu(cpu) { | ||
1036 | cwq = init_cpu_workqueue(wq, cpu); | ||
1037 | if (err || !cpu_online(cpu)) | ||
1038 | continue; | ||
1039 | err = create_workqueue_thread(cwq, cpu); | ||
1040 | start_workqueue_thread(cwq, cpu); | ||
1041 | } | ||
1042 | cpu_maps_update_done(); | ||
1043 | } | 2804 | } |
1044 | 2805 | ||
1045 | if (err) { | 2806 | if (flags & WQ_RESCUER) { |
1046 | destroy_workqueue(wq); | 2807 | struct worker *rescuer; |
1047 | wq = NULL; | 2808 | |
2809 | if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) | ||
2810 | goto err; | ||
2811 | |||
2812 | wq->rescuer = rescuer = alloc_worker(); | ||
2813 | if (!rescuer) | ||
2814 | goto err; | ||
2815 | |||
2816 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); | ||
2817 | if (IS_ERR(rescuer->task)) | ||
2818 | goto err; | ||
2819 | |||
2820 | rescuer->task->flags |= PF_THREAD_BOUND; | ||
2821 | wake_up_process(rescuer->task); | ||
1048 | } | 2822 | } |
1049 | return wq; | ||
1050 | } | ||
1051 | EXPORT_SYMBOL_GPL(__create_workqueue_key); | ||
1052 | 2823 | ||
1053 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | ||
1054 | { | ||
1055 | /* | 2824 | /* |
1056 | * Our caller is either destroy_workqueue() or CPU_POST_DEAD, | 2825 | * workqueue_lock protects global freeze state and workqueues |
1057 | * cpu_add_remove_lock protects cwq->thread. | 2826 | * list. Grab it, set max_active accordingly and add the new |
2827 | * workqueue to workqueues list. | ||
1058 | */ | 2828 | */ |
1059 | if (cwq->thread == NULL) | 2829 | spin_lock(&workqueue_lock); |
1060 | return; | ||
1061 | 2830 | ||
1062 | lock_map_acquire(&cwq->wq->lockdep_map); | 2831 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) |
1063 | lock_map_release(&cwq->wq->lockdep_map); | 2832 | for_each_cwq_cpu(cpu, wq) |
2833 | get_cwq(cpu, wq)->max_active = 0; | ||
1064 | 2834 | ||
1065 | flush_cpu_workqueue(cwq); | 2835 | list_add(&wq->list, &workqueues); |
1066 | /* | 2836 | |
1067 | * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, | 2837 | spin_unlock(&workqueue_lock); |
1068 | * a concurrent flush_workqueue() can insert a barrier after us. | 2838 | |
1069 | * However, in that case run_workqueue() won't return and check | 2839 | return wq; |
1070 | * kthread_should_stop() until it flushes all work_struct's. | 2840 | err: |
1071 | * When ->worklist becomes empty it is safe to exit because no | 2841 | if (wq) { |
1072 | * more work_structs can be queued on this cwq: flush_workqueue | 2842 | free_cwqs(wq); |
1073 | * checks list_empty(), and a "normal" queue_work() can't use | 2843 | free_mayday_mask(wq->mayday_mask); |
1074 | * a dead CPU. | 2844 | kfree(wq->rescuer); |
1075 | */ | 2845 | kfree(wq); |
1076 | trace_workqueue_destruction(cwq->thread); | 2846 | } |
1077 | kthread_stop(cwq->thread); | 2847 | return NULL; |
1078 | cwq->thread = NULL; | ||
1079 | } | 2848 | } |
2849 | EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | ||
1080 | 2850 | ||
1081 | /** | 2851 | /** |
1082 | * destroy_workqueue - safely terminate a workqueue | 2852 | * destroy_workqueue - safely terminate a workqueue |
@@ -1086,72 +2856,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | |||
1086 | */ | 2856 | */ |
1087 | void destroy_workqueue(struct workqueue_struct *wq) | 2857 | void destroy_workqueue(struct workqueue_struct *wq) |
1088 | { | 2858 | { |
1089 | const struct cpumask *cpu_map = wq_cpu_map(wq); | 2859 | unsigned int cpu; |
1090 | int cpu; | 2860 | |
2861 | wq->flags |= WQ_DYING; | ||
2862 | flush_workqueue(wq); | ||
1091 | 2863 | ||
1092 | cpu_maps_update_begin(); | 2864 | /* |
2865 | * wq list is used to freeze wq, remove from list after | ||
2866 | * flushing is complete in case freeze races us. | ||
2867 | */ | ||
1093 | spin_lock(&workqueue_lock); | 2868 | spin_lock(&workqueue_lock); |
1094 | list_del(&wq->list); | 2869 | list_del(&wq->list); |
1095 | spin_unlock(&workqueue_lock); | 2870 | spin_unlock(&workqueue_lock); |
1096 | 2871 | ||
1097 | for_each_cpu(cpu, cpu_map) | 2872 | /* sanity check */ |
1098 | cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); | 2873 | for_each_cwq_cpu(cpu, wq) { |
1099 | cpu_maps_update_done(); | 2874 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
2875 | int i; | ||
2876 | |||
2877 | for (i = 0; i < WORK_NR_COLORS; i++) | ||
2878 | BUG_ON(cwq->nr_in_flight[i]); | ||
2879 | BUG_ON(cwq->nr_active); | ||
2880 | BUG_ON(!list_empty(&cwq->delayed_works)); | ||
2881 | } | ||
1100 | 2882 | ||
1101 | free_percpu(wq->cpu_wq); | 2883 | if (wq->flags & WQ_RESCUER) { |
2884 | kthread_stop(wq->rescuer->task); | ||
2885 | free_mayday_mask(wq->mayday_mask); | ||
2886 | kfree(wq->rescuer); | ||
2887 | } | ||
2888 | |||
2889 | free_cwqs(wq); | ||
1102 | kfree(wq); | 2890 | kfree(wq); |
1103 | } | 2891 | } |
1104 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 2892 | EXPORT_SYMBOL_GPL(destroy_workqueue); |
1105 | 2893 | ||
2894 | /** | ||
2895 | * workqueue_set_max_active - adjust max_active of a workqueue | ||
2896 | * @wq: target workqueue | ||
2897 | * @max_active: new max_active value. | ||
2898 | * | ||
2899 | * Set max_active of @wq to @max_active. | ||
2900 | * | ||
2901 | * CONTEXT: | ||
2902 | * Don't call from IRQ context. | ||
2903 | */ | ||
2904 | void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | ||
2905 | { | ||
2906 | unsigned int cpu; | ||
2907 | |||
2908 | max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); | ||
2909 | |||
2910 | spin_lock(&workqueue_lock); | ||
2911 | |||
2912 | wq->saved_max_active = max_active; | ||
2913 | |||
2914 | for_each_cwq_cpu(cpu, wq) { | ||
2915 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
2916 | |||
2917 | spin_lock_irq(&gcwq->lock); | ||
2918 | |||
2919 | if (!(wq->flags & WQ_FREEZEABLE) || | ||
2920 | !(gcwq->flags & GCWQ_FREEZING)) | ||
2921 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | ||
2922 | |||
2923 | spin_unlock_irq(&gcwq->lock); | ||
2924 | } | ||
2925 | |||
2926 | spin_unlock(&workqueue_lock); | ||
2927 | } | ||
2928 | EXPORT_SYMBOL_GPL(workqueue_set_max_active); | ||
2929 | |||
2930 | /** | ||
2931 | * workqueue_congested - test whether a workqueue is congested | ||
2932 | * @cpu: CPU in question | ||
2933 | * @wq: target workqueue | ||
2934 | * | ||
2935 | * Test whether @wq's cpu workqueue for @cpu is congested. There is | ||
2936 | * no synchronization around this function and the test result is | ||
2937 | * unreliable and only useful as advisory hints or for debugging. | ||
2938 | * | ||
2939 | * RETURNS: | ||
2940 | * %true if congested, %false otherwise. | ||
2941 | */ | ||
2942 | bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) | ||
2943 | { | ||
2944 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
2945 | |||
2946 | return !list_empty(&cwq->delayed_works); | ||
2947 | } | ||
2948 | EXPORT_SYMBOL_GPL(workqueue_congested); | ||
2949 | |||
2950 | /** | ||
2951 | * work_cpu - return the last known associated cpu for @work | ||
2952 | * @work: the work of interest | ||
2953 | * | ||
2954 | * RETURNS: | ||
2955 | * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. | ||
2956 | */ | ||
2957 | unsigned int work_cpu(struct work_struct *work) | ||
2958 | { | ||
2959 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
2960 | |||
2961 | return gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
2962 | } | ||
2963 | EXPORT_SYMBOL_GPL(work_cpu); | ||
2964 | |||
2965 | /** | ||
2966 | * work_busy - test whether a work is currently pending or running | ||
2967 | * @work: the work to be tested | ||
2968 | * | ||
2969 | * Test whether @work is currently pending or running. There is no | ||
2970 | * synchronization around this function and the test result is | ||
2971 | * unreliable and only useful as advisory hints or for debugging. | ||
2972 | * Especially for reentrant wqs, the pending state might hide the | ||
2973 | * running state. | ||
2974 | * | ||
2975 | * RETURNS: | ||
2976 | * OR'd bitmask of WORK_BUSY_* bits. | ||
2977 | */ | ||
2978 | unsigned int work_busy(struct work_struct *work) | ||
2979 | { | ||
2980 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
2981 | unsigned long flags; | ||
2982 | unsigned int ret = 0; | ||
2983 | |||
2984 | if (!gcwq) | ||
2985 | return false; | ||
2986 | |||
2987 | spin_lock_irqsave(&gcwq->lock, flags); | ||
2988 | |||
2989 | if (work_pending(work)) | ||
2990 | ret |= WORK_BUSY_PENDING; | ||
2991 | if (find_worker_executing_work(gcwq, work)) | ||
2992 | ret |= WORK_BUSY_RUNNING; | ||
2993 | |||
2994 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
2995 | |||
2996 | return ret; | ||
2997 | } | ||
2998 | EXPORT_SYMBOL_GPL(work_busy); | ||
2999 | |||
3000 | /* | ||
3001 | * CPU hotplug. | ||
3002 | * | ||
3003 | * There are two challenges in supporting CPU hotplug. Firstly, there | ||
3004 | * are a lot of assumptions on strong associations among work, cwq and | ||
3005 | * gcwq which make migrating pending and scheduled works very | ||
3006 | * difficult to implement without impacting hot paths. Secondly, | ||
3007 | * gcwqs serve mix of short, long and very long running works making | ||
3008 | * blocked draining impractical. | ||
3009 | * | ||
3010 | * This is solved by allowing a gcwq to be detached from CPU, running | ||
3011 | * it with unbound (rogue) workers and allowing it to be reattached | ||
3012 | * later if the cpu comes back online. A separate thread is created | ||
3013 | * to govern a gcwq in such state and is called the trustee of the | ||
3014 | * gcwq. | ||
3015 | * | ||
3016 | * Trustee states and their descriptions. | ||
3017 | * | ||
3018 | * START Command state used on startup. On CPU_DOWN_PREPARE, a | ||
3019 | * new trustee is started with this state. | ||
3020 | * | ||
3021 | * IN_CHARGE Once started, trustee will enter this state after | ||
3022 | * assuming the manager role and making all existing | ||
3023 | * workers rogue. DOWN_PREPARE waits for trustee to | ||
3024 | * enter this state. After reaching IN_CHARGE, trustee | ||
3025 | * tries to execute the pending worklist until it's empty | ||
3026 | * and the state is set to BUTCHER, or the state is set | ||
3027 | * to RELEASE. | ||
3028 | * | ||
3029 | * BUTCHER Command state which is set by the cpu callback after | ||
3030 | * the cpu has gone down. Once this state is set trustee | ||
3031 | * knows that there will be no new works on the worklist | ||
3032 | * and once the worklist is empty it can proceed to | ||
3033 | * killing idle workers. | ||
3034 | * | ||
3035 | * RELEASE Command state which is set by the cpu callback if the | ||
3036 | * cpu down has been canceled or it has come online | ||
3037 | * again. After recognizing this state, trustee stops | ||
3038 | * trying to drain or butcher and clears ROGUE, rebinds | ||
3039 | * all remaining workers back to the cpu and releases | ||
3040 | * manager role. | ||
3041 | * | ||
3042 | * DONE Trustee will enter this state after BUTCHER or RELEASE | ||
3043 | * is complete. | ||
3044 | * | ||
3045 | * trustee CPU draining | ||
3046 | * took over down complete | ||
3047 | * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE | ||
3048 | * | | ^ | ||
3049 | * | CPU is back online v return workers | | ||
3050 | * ----------------> RELEASE -------------- | ||
3051 | */ | ||
3052 | |||
3053 | /** | ||
3054 | * trustee_wait_event_timeout - timed event wait for trustee | ||
3055 | * @cond: condition to wait for | ||
3056 | * @timeout: timeout in jiffies | ||
3057 | * | ||
3058 | * wait_event_timeout() for trustee to use. Handles locking and | ||
3059 | * checks for RELEASE request. | ||
3060 | * | ||
3061 | * CONTEXT: | ||
3062 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3063 | * multiple times. To be used by trustee. | ||
3064 | * | ||
3065 | * RETURNS: | ||
3066 | * Positive value indicating the remaining time if @cond is satisfied, 0 if timed | ||
3067 | * out, -1 if canceled. | ||
3068 | */ | ||
3069 | #define trustee_wait_event_timeout(cond, timeout) ({ \ | ||
3070 | long __ret = (timeout); \ | ||
3071 | while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ | ||
3072 | __ret) { \ | ||
3073 | spin_unlock_irq(&gcwq->lock); \ | ||
3074 | __wait_event_timeout(gcwq->trustee_wait, (cond) || \ | ||
3075 | (gcwq->trustee_state == TRUSTEE_RELEASE), \ | ||
3076 | __ret); \ | ||
3077 | spin_lock_irq(&gcwq->lock); \ | ||
3078 | } \ | ||
3079 | gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ | ||
3080 | }) | ||
3081 | |||
3082 | /** | ||
3083 | * trustee_wait_event - event wait for trustee | ||
3084 | * @cond: condition to wait for | ||
3085 | * | ||
3086 | * wait_event() for trustee to use. Automatically handles locking and | ||
3087 | * checks for CANCEL request. | ||
3088 | * | ||
3089 | * CONTEXT: | ||
3090 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3091 | * multiple times. To be used by trustee. | ||
3092 | * | ||
3093 | * RETURNS: | ||
3094 | * 0 if @cond is satisfied, -1 if canceled. | ||
3095 | */ | ||
3096 | #define trustee_wait_event(cond) ({ \ | ||
3097 | long __ret1; \ | ||
3098 | __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ | ||
3099 | __ret1 < 0 ? -1 : 0; \ | ||
3100 | }) | ||
3101 | |||
/**
 * trustee_thread - take over a gcwq while its cpu is going down
 * @__gcwq: the struct global_cwq to manage
 *
 * Kthread body; workqueue_cpu_callback() creates it on CPU_DOWN_PREPARE
 * and binds it to the cpu.  It claims GCWQ_MANAGING_WORKERS, marks every
 * idle and busy worker WORKER_ROGUE, zeroes nr_running, stops the idle
 * timer and announces TRUSTEE_IN_CHARGE.  It then keeps the worklist
 * draining (maydays, waking idlers, creating rogue workers), butchers
 * idle workers, and finally queues rebind_work for the remaining busy
 * workers before dropping the manager role and announcing TRUSTEE_DONE.
 *
 * RETURNS:
 * Always 0.
 */
3102 | static int __cpuinit trustee_thread(void *__gcwq) | ||
3103 | { | ||
3104 | struct global_cwq *gcwq = __gcwq; | ||
3105 | struct worker *worker; | ||
3106 | struct work_struct *work; | ||
3107 | struct hlist_node *pos; | ||
3108 | long rc; | ||
3109 | int i; | ||
3110 | |||
3111 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3112 | |||
3113 | spin_lock_irq(&gcwq->lock); | ||
3114 | /* | ||
3115 | * Claim the manager position and make all workers rogue. | ||
3116 | * Trustee must be bound to the target cpu and can't be | ||
3117 | * cancelled. | ||
3118 | */ | ||
3119 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
3120 | rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); | ||
3121 | BUG_ON(rc < 0); | ||
3122 | |||
3123 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
3124 | |||
3125 | list_for_each_entry(worker, &gcwq->idle_list, entry) | ||
3126 | worker->flags |= WORKER_ROGUE; | ||
3127 | |||
3128 | for_each_busy_worker(worker, i, pos, gcwq) | ||
3129 | worker->flags |= WORKER_ROGUE; | ||
3130 | |||
3131 | /* | ||
3132 | * Call schedule() so that we cross rq->lock and thus can | ||
3133 | * guarantee sched callbacks see the rogue flag. This is | ||
3134 | * necessary as scheduler callbacks may be invoked from other | ||
3135 | * cpus. | ||
3136 | */ | ||
3137 | spin_unlock_irq(&gcwq->lock); | ||
3138 | schedule(); | ||
3139 | spin_lock_irq(&gcwq->lock); | ||
3140 | |||
3141 | /* | ||
3142 | * Sched callbacks are disabled now. Zap nr_running. After | ||
3143 | * this, nr_running stays zero and need_more_worker() and | ||
3144 | * keep_working() are always true as long as the worklist is | ||
3145 | * not empty. | ||
3146 | */ | ||
3147 | atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); | ||
3148 | |||
3149 | spin_unlock_irq(&gcwq->lock); | ||
3150 | del_timer_sync(&gcwq->idle_timer); | ||
3151 | spin_lock_irq(&gcwq->lock); | ||
3152 | |||
3153 | /* | ||
3154 | * We're now in charge. Notify and proceed to drain. We need | ||
3155 | * to keep the gcwq running during the whole CPU down | ||
3156 | * procedure as other cpu hotunplug callbacks may need to | ||
3157 | * flush currently running tasks. | ||
3158 | */ | ||
3159 | gcwq->trustee_state = TRUSTEE_IN_CHARGE; | ||
3160 | wake_up_all(&gcwq->trustee_wait); | ||
3161 | |||
3162 | /* | ||
3163 | * The original cpu is in the process of dying and may go away | ||
3164 | * anytime now. When that happens, we and all workers would | ||
3165 | * be migrated to other cpus. Try draining any left work. We | ||
3166 | * want to get it over with ASAP - spam rescuers, wake up as | ||
3167 | * many idlers as necessary and create new ones till the | ||
3168 | * worklist is empty. Note that if the gcwq is frozen, there | ||
3169 | * may be frozen works in freezeable cwqs. Don't declare | ||
3170 | * completion while frozen. | ||
3171 | */ | ||
3172 | while (gcwq->nr_workers != gcwq->nr_idle || | ||
3173 | gcwq->flags & GCWQ_FREEZING || | ||
3174 | gcwq->trustee_state == TRUSTEE_IN_CHARGE) { | ||
3175 | int nr_works = 0; | ||
3176 | |||
3177 | list_for_each_entry(work, &gcwq->worklist, entry) { | ||
3178 | send_mayday(work); | ||
3179 | nr_works++; | ||
3180 | } | ||
3181 | |||
3182 | list_for_each_entry(worker, &gcwq->idle_list, entry) { | ||
3183 | if (!nr_works--) | ||
3184 | break; | ||
3185 | wake_up_process(worker->task); | ||
3186 | } | ||
3187 | |||
3188 | if (need_to_create_worker(gcwq)) { | ||
3189 | spin_unlock_irq(&gcwq->lock); | ||
3190 | worker = create_worker(gcwq, false); | ||
3191 | spin_lock_irq(&gcwq->lock); | ||
3192 | if (worker) { | ||
3193 | worker->flags |= WORKER_ROGUE; | ||
3194 | start_worker(worker); | ||
3195 | } | ||
3196 | } | ||
3197 | |||
3198 | /* give a breather */ | ||
3199 | if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) | ||
3200 | break; | ||
3201 | } | ||
3202 | |||
3203 | /* | ||
3204 | * Either all works have been scheduled and cpu is down, or | ||
3205 | * cpu down has already been canceled. Wait for and butcher | ||
3206 | * all workers till we're canceled. | ||
3207 | */ | ||
3208 | do { | ||
3209 | rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); | ||
3210 | while (!list_empty(&gcwq->idle_list)) | ||
3211 | destroy_worker(list_first_entry(&gcwq->idle_list, | ||
3212 | struct worker, entry)); | ||
3213 | } while (gcwq->nr_workers && rc >= 0); | ||
3214 | |||
3215 | /* | ||
3216 | * At this point, either draining has completed and no worker | ||
3217 | * is left, or cpu down has been canceled or the cpu is being | ||
3218 | * brought back up. There shouldn't be any idle one left. | ||
3219 | * Tell the remaining busy ones to rebind once they finish the | ||
3220 | * currently scheduled works by scheduling the rebind_work. | ||
3221 | */ | ||
3222 | WARN_ON(!list_empty(&gcwq->idle_list)); | ||
3223 | |||
3224 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
3225 | struct work_struct *rebind_work = &worker->rebind_work; | ||
3226 | |||
3227 | /* | ||
3228 | * Rebind_work may race with future cpu hotplug | ||
3229 | * operations. Use a separate flag to mark that | ||
3230 | * rebinding is scheduled. | ||
3231 | */ | ||
3232 | worker->flags |= WORKER_REBIND; | ||
3233 | worker->flags &= ~WORKER_ROGUE; | ||
3234 | |||
3235 | /* queue rebind_work, wq doesn't matter, use the default one */ | ||
3236 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
3237 | work_data_bits(rebind_work))) | ||
3238 | continue; | ||
3239 | |||
3240 | debug_work_activate(rebind_work); | ||
3241 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
3242 | worker->scheduled.next, | ||
3243 | work_color_to_flags(WORK_NO_COLOR)); | ||
3244 | } | ||
3245 | |||
3246 | /* relinquish manager role */ | ||
3247 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
3248 | |||
3249 | /* notify completion */ | ||
3250 | gcwq->trustee = NULL; | ||
3251 | gcwq->trustee_state = TRUSTEE_DONE; | ||
3252 | wake_up_all(&gcwq->trustee_wait); | ||
3253 | spin_unlock_irq(&gcwq->lock); | ||
3254 | return 0; | ||
3255 | } | ||
3256 | |||
3257 | /** | ||
3258 | * wait_trustee_state - wait for trustee to enter the specified state | ||
3259 | * @gcwq: gcwq the trustee of interest belongs to | ||
3260 | * @state: target state to wait for | ||
3261 | * | ||
3262 | * Wait for the trustee to reach @state. TRUSTEE_DONE always matches too. | ||
3263 | * | ||
3264 | * CONTEXT: | ||
3265 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
3266 | * multiple times. To be used by cpu_callback. | ||
3267 | */ | ||
3268 | static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) | ||
3269 | __releases(&gcwq->lock) | ||
3270 | __acquires(&gcwq->lock) | ||
3271 | { | ||
3272 | if (!(gcwq->trustee_state == state || | ||
3273 | gcwq->trustee_state == TRUSTEE_DONE)) { | ||
3274 | spin_unlock_irq(&gcwq->lock); | ||
3275 | __wait_event(gcwq->trustee_wait, | ||
3276 | gcwq->trustee_state == state || | ||
3277 | gcwq->trustee_state == TRUSTEE_DONE); | ||
3278 | spin_lock_irq(&gcwq->lock); | ||
3279 | } | ||
3280 | } | ||
3281 | |||
/**
 * workqueue_cpu_callback - cpu hotplug notifier for workqueues
 * @nfb: notifier block (not referenced by the new implementation)
 * @action: hotplug event; CPU_TASKS_FROZEN is masked off before dispatch
 * @hcpu: number of the cpu the event is for, passed as a pointer value
 *
 * Before taking gcwq->lock: CPU_DOWN_PREPARE creates the trustee kthread
 * and binds it to @hcpu, then, like CPU_UP_PREPARE, creates a spare
 * worker.  Under gcwq->lock:
 *   CPU_DOWN_PREPARE - install and wake the trustee, wait until it is
 *                      TRUSTEE_IN_CHARGE, then park the spare worker in
 *                      gcwq->first_idle
 *   CPU_UP_PREPARE   - park the spare worker in gcwq->first_idle
 *   CPU_DYING        - set GCWQ_DISASSOCIATED
 *   CPU_POST_DEAD    - set TRUSTEE_BUTCHER, then destroy first_idle
 *   CPU_UP_CANCELED  - destroy first_idle
 *   CPU_DOWN_FAILED,
 *   CPU_ONLINE       - clear GCWQ_DISASSOCIATED, release the trustee and
 *                      wait for TRUSTEE_DONE if it is not done yet, bind
 *                      and start first_idle, set GCWQ_MANAGE_WORKERS
 *
 * RETURNS:
 * notifier_from_errno() of the kthread_create() error if the trustee
 * can't be created, NOTIFY_BAD if the spare worker can't be created,
 * notifier_from_errno(0) otherwise.
 */
1106 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | 3282 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
1107 | unsigned long action, | 3283 | unsigned long action, |
1108 | void *hcpu) | 3284 | void *hcpu) |
1109 | { | 3285 | { |
1110 | unsigned int cpu = (unsigned long)hcpu; | 3286 | unsigned int cpu = (unsigned long)hcpu; |
1111 | struct cpu_workqueue_struct *cwq; | 3287 | struct global_cwq *gcwq = get_gcwq(cpu); |
1112 | struct workqueue_struct *wq; | 3288 | struct task_struct *new_trustee = NULL; |
1113 | int err = 0; | 3289 | struct worker *uninitialized_var(new_worker); |
3290 | unsigned long flags; | ||
1114 | 3291 | ||
1115 | action &= ~CPU_TASKS_FROZEN; | 3292 | action &= ~CPU_TASKS_FROZEN; |
1116 | 3293 | ||
1117 | switch (action) { | 3294 | switch (action) { |
3295 | case CPU_DOWN_PREPARE: | ||
3296 | new_trustee = kthread_create(trustee_thread, gcwq, | ||
3297 | "workqueue_trustee/%d", cpu); | ||
3298 | if (IS_ERR(new_trustee)) | ||
3299 | return notifier_from_errno(PTR_ERR(new_trustee)); | ||
3300 | kthread_bind(new_trustee, cpu); | ||
3301 | /* fall through */ | ||
1118 | case CPU_UP_PREPARE: | 3302 | case CPU_UP_PREPARE: |
1119 | cpumask_set_cpu(cpu, cpu_populated_map); | 3303 | BUG_ON(gcwq->first_idle); |
1120 | } | 3304 | new_worker = create_worker(gcwq, false); |
1121 | undo: | 3305 | if (!new_worker) { |
1122 | list_for_each_entry(wq, &workqueues, list) { | 3306 | if (new_trustee) |
1123 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 3307 | kthread_stop(new_trustee); |
1124 | 3308 | return NOTIFY_BAD; | 
1125 | switch (action) { | ||
1126 | case CPU_UP_PREPARE: | ||
1127 | err = create_workqueue_thread(cwq, cpu); | ||
1128 | if (!err) | ||
1129 | break; | ||
1130 | printk(KERN_ERR "workqueue [%s] for %i failed\n", | ||
1131 | wq->name, cpu); | ||
1132 | action = CPU_UP_CANCELED; | ||
1133 | err = -ENOMEM; | ||
1134 | goto undo; | ||
1135 | |||
1136 | case CPU_ONLINE: | ||
1137 | start_workqueue_thread(cwq, cpu); | ||
1138 | break; | ||
1139 | |||
1140 | case CPU_UP_CANCELED: | ||
1141 | start_workqueue_thread(cwq, -1); | ||
1142 | case CPU_POST_DEAD: | ||
1143 | cleanup_workqueue_thread(cwq); | ||
1144 | break; | ||
1145 | } | 3309 | } |
1146 | } | 3310 | } |
1147 | 3311 | ||
3312 | /* some are called w/ irq disabled, don't disturb irq status */ | ||
3313 | spin_lock_irqsave(&gcwq->lock, flags); | ||
3314 | |||
1148 | switch (action) { | 3315 | switch (action) { |
1149 | case CPU_UP_CANCELED: | 3316 | case CPU_DOWN_PREPARE: |
3317 | /* initialize trustee and tell it to acquire the gcwq */ | ||
3318 | BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); | ||
3319 | gcwq->trustee = new_trustee; | ||
3320 | gcwq->trustee_state = TRUSTEE_START; | ||
3321 | wake_up_process(gcwq->trustee); | ||
3322 | wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); | ||
3323 | /* fall through */ | ||
3324 | case CPU_UP_PREPARE: | ||
3325 | BUG_ON(gcwq->first_idle); | ||
3326 | gcwq->first_idle = new_worker; | ||
3327 | break; | ||
3328 | |||
3329 | case CPU_DYING: | ||
3330 | /* | ||
3331 | * Before this, the trustee and all workers except for | ||
3332 | * the ones which are still executing works from | ||
3333 | * before the last CPU down must be on the cpu. After | ||
3334 | * this, they'll all be diasporas. | ||
3335 | */ | ||
3336 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
3337 | break; | ||
3338 | |||
1150 | case CPU_POST_DEAD: | 3339 | case CPU_POST_DEAD: |
1151 | cpumask_clear_cpu(cpu, cpu_populated_map); | 3340 | gcwq->trustee_state = TRUSTEE_BUTCHER; |
3341 | /* fall through */ | ||
3342 | case CPU_UP_CANCELED: | ||
3343 | destroy_worker(gcwq->first_idle); | ||
3344 | gcwq->first_idle = NULL; | ||
3345 | break; | ||
3346 | |||
3347 | case CPU_DOWN_FAILED: | ||
3348 | case CPU_ONLINE: | ||
3349 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | ||
3350 | if (gcwq->trustee_state != TRUSTEE_DONE) { | ||
3351 | gcwq->trustee_state = TRUSTEE_RELEASE; | ||
3352 | wake_up_process(gcwq->trustee); | ||
3353 | wait_trustee_state(gcwq, TRUSTEE_DONE); | ||
3354 | } | ||
3355 | |||
3356 | /* | ||
3357 | * Trustee is done and there might be no worker left. | ||
3358 | * Put the first_idle in and request a real manager to | ||
3359 | * take a look. | ||
3360 | */ | ||
3361 | spin_unlock_irq(&gcwq->lock); | ||
3362 | kthread_bind(gcwq->first_idle->task, cpu); | ||
3363 | spin_lock_irq(&gcwq->lock); | ||
3364 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
3365 | start_worker(gcwq->first_idle); | ||
3366 | gcwq->first_idle = NULL; | ||
3367 | break; | ||
1152 | } | 3368 | } |
1153 | 3369 | ||
1154 | return notifier_from_errno(err); | 3370 | spin_unlock_irqrestore(&gcwq->lock, flags); |
3371 | |||
3372 | return notifier_from_errno(0); | ||
1155 | } | 3373 | } |
1156 | 3374 | ||
1157 | #ifdef CONFIG_SMP | 3375 | #ifdef CONFIG_SMP |
@@ -1201,14 +3419,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) | |||
1201 | EXPORT_SYMBOL_GPL(work_on_cpu); | 3419 | EXPORT_SYMBOL_GPL(work_on_cpu); |
1202 | #endif /* CONFIG_SMP */ | 3420 | #endif /* CONFIG_SMP */ |
1203 | 3421 | ||
1204 | void __init init_workqueues(void) | 3422 | #ifdef CONFIG_FREEZER |
3423 | |||
3424 | /** | ||
3425 | * freeze_workqueues_begin - begin freezing workqueues | ||
3426 | * | ||
3427 | * Start freezing workqueues. After this function returns, all | ||
3428 | * freezeable workqueues will queue new works to their delayed_works | ||
3429 | * list instead of gcwq->worklist. | ||
3430 | * | ||
3431 | * CONTEXT: | ||
3432 | * Grabs and releases workqueue_lock and gcwq->lock's. | ||
3433 | */ | ||
3434 | void freeze_workqueues_begin(void) | ||
3435 | { | ||
3436 | unsigned int cpu; | ||
3437 | |||
3438 | spin_lock(&workqueue_lock); | ||
3439 | |||
3440 | BUG_ON(workqueue_freezing); | ||
3441 | workqueue_freezing = true; | ||
3442 | |||
3443 | for_each_gcwq_cpu(cpu) { | ||
3444 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3445 | struct workqueue_struct *wq; | ||
3446 | |||
3447 | spin_lock_irq(&gcwq->lock); | ||
3448 | |||
3449 | BUG_ON(gcwq->flags & GCWQ_FREEZING); | ||
3450 | gcwq->flags |= GCWQ_FREEZING; | ||
3451 | |||
3452 | list_for_each_entry(wq, &workqueues, list) { | ||
3453 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3454 | |||
3455 | if (cwq && wq->flags & WQ_FREEZEABLE) | ||
3456 | cwq->max_active = 0; | ||
3457 | } | ||
3458 | |||
3459 | spin_unlock_irq(&gcwq->lock); | ||
3460 | } | ||
3461 | |||
3462 | spin_unlock(&workqueue_lock); | ||
3463 | } | ||
3464 | |||
3465 | /** | ||
3466 | * freeze_workqueues_busy - are freezeable workqueues still busy? | ||
3467 | * | ||
3468 | * Check whether freezing is complete. This function must be called | ||
3469 | * between freeze_workqueues_begin() and thaw_workqueues(). | ||
3470 | * | ||
3471 | * CONTEXT: | ||
3472 | * Grabs and releases workqueue_lock. | ||
3473 | * | ||
3474 | * RETURNS: | ||
3475 | * %true if some freezeable workqueues are still busy. %false if | ||
3476 | * freezing is complete. | ||
3477 | */ | ||
3478 | bool freeze_workqueues_busy(void) | ||
3479 | { | ||
3480 | unsigned int cpu; | ||
3481 | bool busy = false; | ||
3482 | |||
3483 | spin_lock(&workqueue_lock); | ||
3484 | |||
3485 | BUG_ON(!workqueue_freezing); | ||
3486 | |||
3487 | for_each_gcwq_cpu(cpu) { | ||
3488 | struct workqueue_struct *wq; | ||
3489 | /* | ||
3490 | * nr_active is monotonically decreasing. It's safe | ||
3491 | * to peek without gcwq->lock (only workqueue_lock is held here). | ||
3492 | */ | ||
3493 | list_for_each_entry(wq, &workqueues, list) { | ||
3494 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3495 | |||
3496 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | ||
3497 | continue; | ||
3498 | |||
3499 | BUG_ON(cwq->nr_active < 0); | ||
3500 | if (cwq->nr_active) { | ||
3501 | busy = true; | ||
3502 | goto out_unlock; | ||
3503 | } | ||
3504 | } | ||
3505 | } | ||
3506 | out_unlock: | ||
3507 | spin_unlock(&workqueue_lock); | ||
3508 | return busy; | ||
3509 | } | ||
3510 | |||
3511 | /** | ||
3512 | * thaw_workqueues - thaw workqueues | ||
3513 | * | ||
3514 | * Thaw workqueues. Normal queueing is restored and all collected | ||
3515 | * frozen works are transferred to their respective gcwq worklists. Does nothing if workqueue_freezing is not set. | ||
3516 | * | ||
3517 | * CONTEXT: | ||
3518 | * Grabs and releases workqueue_lock and gcwq->lock's. | ||
3519 | */ | ||
3520 | void thaw_workqueues(void) | ||
1205 | { | 3521 | { |
1206 | alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); | 3522 | unsigned int cpu; |
3523 | |||
3524 | spin_lock(&workqueue_lock); | ||
3525 | |||
3526 | if (!workqueue_freezing) | ||
3527 | goto out_unlock; | ||
1207 | 3528 | ||
1208 | cpumask_copy(cpu_populated_map, cpu_online_mask); | 3529 | for_each_gcwq_cpu(cpu) { |
1209 | singlethread_cpu = cpumask_first(cpu_possible_mask); | 3530 | struct global_cwq *gcwq = get_gcwq(cpu); |
1210 | cpu_singlethread_map = cpumask_of(singlethread_cpu); | 3531 | struct workqueue_struct *wq; |
1211 | hotcpu_notifier(workqueue_cpu_callback, 0); | 3532 | 
1212 | keventd_wq = create_workqueue("events"); | 3533 | spin_lock_irq(&gcwq->lock); |
1213 | BUG_ON(!keventd_wq); | 3534 | 
3535 | BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); | ||
3536 | gcwq->flags &= ~GCWQ_FREEZING; | ||
3537 | |||
3538 | list_for_each_entry(wq, &workqueues, list) { | ||
3539 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
3540 | |||
3541 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | ||
3542 | continue; | ||
3543 | |||
3544 | /* restore max_active and repopulate worklist */ | ||
3545 | cwq->max_active = wq->saved_max_active; | ||
3546 | |||
3547 | while (!list_empty(&cwq->delayed_works) && | ||
3548 | cwq->nr_active < cwq->max_active) | ||
3549 | cwq_activate_first_delayed(cwq); | ||
3550 | } | ||
3551 | |||
3552 | wake_up_worker(gcwq); | ||
3553 | |||
3554 | spin_unlock_irq(&gcwq->lock); | ||
3555 | } | ||
3556 | |||
3557 | workqueue_freezing = false; | ||
3558 | out_unlock: | ||
3559 | spin_unlock(&workqueue_lock); | ||
3560 | } | ||
3561 | #endif /* CONFIG_FREEZER */ | ||
3562 | |||
/**
 * init_workqueues - early-boot initialization of the workqueue subsystem
 *
 * Registers workqueue_cpu_callback() as a cpu notifier, initializes every
 * gcwq (lock, worklist, idle list, busy hash, idle and mayday timers,
 * worker ida, trustee state), creates and starts the initial worker for
 * each online gcwq cpu, then allocates the system workqueues.
 *
 * RETURNS:
 * Always 0.  Failing to create an initial worker or to allocate any of
 * the four system workqueues (including system_unbound_wq) hits BUG_ON().
 */
3563 | static int __init init_workqueues(void) | ||
3564 | { | ||
3565 | unsigned int cpu; | ||
3566 | int i; | ||
3567 | |||
3568 | cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); | ||
3569 | |||
3570 | /* initialize gcwqs */ | ||
3571 | for_each_gcwq_cpu(cpu) { | ||
3572 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3573 | |||
3574 | spin_lock_init(&gcwq->lock); | ||
3575 | INIT_LIST_HEAD(&gcwq->worklist); | ||
3576 | gcwq->cpu = cpu; | ||
3577 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
3578 | |||
3579 | INIT_LIST_HEAD(&gcwq->idle_list); | ||
3580 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) | ||
3581 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); | ||
3582 | |||
3583 | init_timer_deferrable(&gcwq->idle_timer); | ||
3584 | gcwq->idle_timer.function = idle_worker_timeout; | ||
3585 | gcwq->idle_timer.data = (unsigned long)gcwq; | ||
3586 | |||
3587 | setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, | ||
3588 | (unsigned long)gcwq); | ||
3589 | |||
3590 | ida_init(&gcwq->worker_ida); | ||
3591 | |||
3592 | gcwq->trustee_state = TRUSTEE_DONE; | ||
3593 | init_waitqueue_head(&gcwq->trustee_wait); | ||
3594 | } | ||
3595 | |||
3596 | /* create the initial worker */ | ||
3597 | for_each_online_gcwq_cpu(cpu) { | ||
3598 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
3599 | struct worker *worker; | ||
3600 | |||
3601 | if (cpu != WORK_CPU_UNBOUND) | ||
3602 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | ||
3603 | worker = create_worker(gcwq, true); | ||
3604 | BUG_ON(!worker); | ||
3605 | spin_lock_irq(&gcwq->lock); | ||
3606 | start_worker(worker); | ||
3607 | spin_unlock_irq(&gcwq->lock); | ||
3608 | } | ||
3609 | |||
3610 | system_wq = alloc_workqueue("events", 0, 0); | ||
3611 | system_long_wq = alloc_workqueue("events_long", 0, 0); | ||
3612 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | ||
3613 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | ||
3614 | WQ_UNBOUND_MAX_ACTIVE); | ||
3615 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || !system_unbound_wq); | ||
3616 | return 0; | ||
1214 | } | 3617 | } |
3618 | early_initcall(init_workqueues); | ||
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 000000000000..2d10fc98dc79 --- /dev/null +++ b/kernel/workqueue_sched.h | |||
@@ -0,0 +1,9 @@ | |||
1 | /* | ||
2 | * kernel/workqueue_sched.h | ||
3 | * | ||
4 | * Scheduler hooks for concurrency managed workqueue. Only to be | ||
5 | * included from sched.c and workqueue.c. | ||
6 | */ | ||
7 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); | ||
8 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
9 | unsigned int cpu); | ||