diff options
Diffstat (limited to 'kernel')
67 files changed, 5489 insertions, 5330 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 057472fbc272..c53e491e25a8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -76,8 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/ | |||
| 76 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 76 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
| 77 | obj-$(CONFIG_KPROBES) += kprobes.o | 77 | obj-$(CONFIG_KPROBES) += kprobes.o |
| 78 | obj-$(CONFIG_KGDB) += debug/ | 78 | obj-$(CONFIG_KGDB) += debug/ |
| 79 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | ||
| 80 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 79 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
| 80 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | ||
| 81 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 81 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
| 82 | obj-$(CONFIG_SECCOMP) += seccomp.o | 82 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 83 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 83 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| @@ -99,8 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ | |||
| 99 | obj-$(CONFIG_X86_DS) += trace/ | 99 | obj-$(CONFIG_X86_DS) += trace/ |
| 100 | obj-$(CONFIG_RING_BUFFER) += trace/ | 100 | obj-$(CONFIG_RING_BUFFER) += trace/ |
| 101 | obj-$(CONFIG_SMP) += sched_cpupri.o | 101 | obj-$(CONFIG_SMP) += sched_cpupri.o |
| 102 | obj-$(CONFIG_SLOW_WORK) += slow-work.o | ||
| 103 | obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o | ||
| 104 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 102 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o |
| 105 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 103 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
| 106 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 104 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
diff --git a/kernel/async.c b/kernel/async.c index 15319d6c18fe..cd9dbb913c77 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
| @@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel. | |||
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #include <linux/async.h> | 51 | #include <linux/async.h> |
| 52 | #include <linux/bug.h> | ||
| 53 | #include <linux/module.h> | 52 | #include <linux/module.h> |
| 54 | #include <linux/wait.h> | 53 | #include <linux/wait.h> |
| 55 | #include <linux/sched.h> | 54 | #include <linux/sched.h> |
| 56 | #include <linux/init.h> | ||
| 57 | #include <linux/kthread.h> | ||
| 58 | #include <linux/delay.h> | ||
| 59 | #include <linux/slab.h> | 55 | #include <linux/slab.h> |
| 56 | #include <linux/workqueue.h> | ||
| 60 | #include <asm/atomic.h> | 57 | #include <asm/atomic.h> |
| 61 | 58 | ||
| 62 | static async_cookie_t next_cookie = 1; | 59 | static async_cookie_t next_cookie = 1; |
| 63 | 60 | ||
| 64 | #define MAX_THREADS 256 | ||
| 65 | #define MAX_WORK 32768 | 61 | #define MAX_WORK 32768 |
| 66 | 62 | ||
| 67 | static LIST_HEAD(async_pending); | 63 | static LIST_HEAD(async_pending); |
| 68 | static LIST_HEAD(async_running); | 64 | static LIST_HEAD(async_running); |
| 69 | static DEFINE_SPINLOCK(async_lock); | 65 | static DEFINE_SPINLOCK(async_lock); |
| 70 | 66 | ||
| 71 | static int async_enabled = 0; | ||
| 72 | |||
| 73 | struct async_entry { | 67 | struct async_entry { |
| 74 | struct list_head list; | 68 | struct list_head list; |
| 75 | async_cookie_t cookie; | 69 | struct work_struct work; |
| 76 | async_func_ptr *func; | 70 | async_cookie_t cookie; |
| 77 | void *data; | 71 | async_func_ptr *func; |
| 78 | struct list_head *running; | 72 | void *data; |
| 73 | struct list_head *running; | ||
| 79 | }; | 74 | }; |
| 80 | 75 | ||
| 81 | static DECLARE_WAIT_QUEUE_HEAD(async_done); | 76 | static DECLARE_WAIT_QUEUE_HEAD(async_done); |
| 82 | static DECLARE_WAIT_QUEUE_HEAD(async_new); | ||
| 83 | 77 | ||
| 84 | static atomic_t entry_count; | 78 | static atomic_t entry_count; |
| 85 | static atomic_t thread_count; | ||
| 86 | 79 | ||
| 87 | extern int initcall_debug; | 80 | extern int initcall_debug; |
| 88 | 81 | ||
| @@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) | |||
| 117 | spin_unlock_irqrestore(&async_lock, flags); | 110 | spin_unlock_irqrestore(&async_lock, flags); |
| 118 | return ret; | 111 | return ret; |
| 119 | } | 112 | } |
| 113 | |||
| 120 | /* | 114 | /* |
| 121 | * pick the first pending entry and run it | 115 | * pick the first pending entry and run it |
| 122 | */ | 116 | */ |
| 123 | static void run_one_entry(void) | 117 | static void async_run_entry_fn(struct work_struct *work) |
| 124 | { | 118 | { |
| 119 | struct async_entry *entry = | ||
| 120 | container_of(work, struct async_entry, work); | ||
| 125 | unsigned long flags; | 121 | unsigned long flags; |
| 126 | struct async_entry *entry; | ||
| 127 | ktime_t calltime, delta, rettime; | 122 | ktime_t calltime, delta, rettime; |
| 128 | 123 | ||
| 129 | /* 1) pick one task from the pending queue */ | 124 | /* 1) move self to the running queue */ |
| 130 | |||
| 131 | spin_lock_irqsave(&async_lock, flags); | 125 | spin_lock_irqsave(&async_lock, flags); |
| 132 | if (list_empty(&async_pending)) | ||
| 133 | goto out; | ||
| 134 | entry = list_first_entry(&async_pending, struct async_entry, list); | ||
| 135 | |||
| 136 | /* 2) move it to the running queue */ | ||
| 137 | list_move_tail(&entry->list, entry->running); | 126 | list_move_tail(&entry->list, entry->running); |
| 138 | spin_unlock_irqrestore(&async_lock, flags); | 127 | spin_unlock_irqrestore(&async_lock, flags); |
| 139 | 128 | ||
| 140 | /* 3) run it (and print duration)*/ | 129 | /* 2) run (and print duration) */ |
| 141 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 130 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
| 142 | printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, | 131 | printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, |
| 143 | entry->func, task_pid_nr(current)); | 132 | entry->func, task_pid_nr(current)); |
| @@ -153,31 +142,25 @@ static void run_one_entry(void) | |||
| 153 | (long long)ktime_to_ns(delta) >> 10); | 142 | (long long)ktime_to_ns(delta) >> 10); |
| 154 | } | 143 | } |
| 155 | 144 | ||
| 156 | /* 4) remove it from the running queue */ | 145 | /* 3) remove self from the running queue */ |
| 157 | spin_lock_irqsave(&async_lock, flags); | 146 | spin_lock_irqsave(&async_lock, flags); |
| 158 | list_del(&entry->list); | 147 | list_del(&entry->list); |
| 159 | 148 | ||
| 160 | /* 5) free the entry */ | 149 | /* 4) free the entry */ |
| 161 | kfree(entry); | 150 | kfree(entry); |
| 162 | atomic_dec(&entry_count); | 151 | atomic_dec(&entry_count); |
| 163 | 152 | ||
| 164 | spin_unlock_irqrestore(&async_lock, flags); | 153 | spin_unlock_irqrestore(&async_lock, flags); |
| 165 | 154 | ||
| 166 | /* 6) wake up any waiters. */ | 155 | /* 5) wake up any waiters */ |
| 167 | wake_up(&async_done); | 156 | wake_up(&async_done); |
| 168 | return; | ||
| 169 | |||
| 170 | out: | ||
| 171 | spin_unlock_irqrestore(&async_lock, flags); | ||
| 172 | } | 157 | } |
| 173 | 158 | ||
| 174 | |||
| 175 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) | 159 | static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) |
| 176 | { | 160 | { |
| 177 | struct async_entry *entry; | 161 | struct async_entry *entry; |
| 178 | unsigned long flags; | 162 | unsigned long flags; |
| 179 | async_cookie_t newcookie; | 163 | async_cookie_t newcookie; |
| 180 | |||
| 181 | 164 | ||
| 182 | /* allow irq-off callers */ | 165 | /* allow irq-off callers */ |
| 183 | entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); | 166 | entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); |
| @@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
| 186 | * If we're out of memory or if there's too much work | 169 | * If we're out of memory or if there's too much work |
| 187 | * pending already, we execute synchronously. | 170 | * pending already, we execute synchronously. |
| 188 | */ | 171 | */ |
| 189 | if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { | 172 | if (!entry || atomic_read(&entry_count) > MAX_WORK) { |
| 190 | kfree(entry); | 173 | kfree(entry); |
| 191 | spin_lock_irqsave(&async_lock, flags); | 174 | spin_lock_irqsave(&async_lock, flags); |
| 192 | newcookie = next_cookie++; | 175 | newcookie = next_cookie++; |
| @@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
| 196 | ptr(data, newcookie); | 179 | ptr(data, newcookie); |
| 197 | return newcookie; | 180 | return newcookie; |
| 198 | } | 181 | } |
| 182 | INIT_WORK(&entry->work, async_run_entry_fn); | ||
| 199 | entry->func = ptr; | 183 | entry->func = ptr; |
| 200 | entry->data = data; | 184 | entry->data = data; |
| 201 | entry->running = running; | 185 | entry->running = running; |
| @@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l | |||
| 205 | list_add_tail(&entry->list, &async_pending); | 189 | list_add_tail(&entry->list, &async_pending); |
| 206 | atomic_inc(&entry_count); | 190 | atomic_inc(&entry_count); |
| 207 | spin_unlock_irqrestore(&async_lock, flags); | 191 | spin_unlock_irqrestore(&async_lock, flags); |
| 208 | wake_up(&async_new); | 192 | |
| 193 | /* schedule for execution */ | ||
| 194 | queue_work(system_unbound_wq, &entry->work); | ||
| 195 | |||
| 209 | return newcookie; | 196 | return newcookie; |
| 210 | } | 197 | } |
| 211 | 198 | ||
| @@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) | |||
| 312 | async_synchronize_cookie_domain(cookie, &async_running); | 299 | async_synchronize_cookie_domain(cookie, &async_running); |
| 313 | } | 300 | } |
| 314 | EXPORT_SYMBOL_GPL(async_synchronize_cookie); | 301 | EXPORT_SYMBOL_GPL(async_synchronize_cookie); |
| 315 | |||
| 316 | |||
| 317 | static int async_thread(void *unused) | ||
| 318 | { | ||
| 319 | DECLARE_WAITQUEUE(wq, current); | ||
| 320 | add_wait_queue(&async_new, &wq); | ||
| 321 | |||
| 322 | while (!kthread_should_stop()) { | ||
| 323 | int ret = HZ; | ||
| 324 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 325 | /* | ||
| 326 | * check the list head without lock.. false positives | ||
| 327 | * are dealt with inside run_one_entry() while holding | ||
| 328 | * the lock. | ||
| 329 | */ | ||
| 330 | rmb(); | ||
| 331 | if (!list_empty(&async_pending)) | ||
| 332 | run_one_entry(); | ||
| 333 | else | ||
| 334 | ret = schedule_timeout(HZ); | ||
| 335 | |||
| 336 | if (ret == 0) { | ||
| 337 | /* | ||
| 338 | * we timed out, this means we as thread are redundant. | ||
| 339 | * we sign off and die, but we to avoid any races there | ||
| 340 | * is a last-straw check to see if work snuck in. | ||
| 341 | */ | ||
| 342 | atomic_dec(&thread_count); | ||
| 343 | wmb(); /* manager must see our departure first */ | ||
| 344 | if (list_empty(&async_pending)) | ||
| 345 | break; | ||
| 346 | /* | ||
| 347 | * woops work came in between us timing out and us | ||
| 348 | * signing off; we need to stay alive and keep working. | ||
| 349 | */ | ||
| 350 | atomic_inc(&thread_count); | ||
| 351 | } | ||
| 352 | } | ||
| 353 | remove_wait_queue(&async_new, &wq); | ||
| 354 | |||
| 355 | return 0; | ||
| 356 | } | ||
| 357 | |||
| 358 | static int async_manager_thread(void *unused) | ||
| 359 | { | ||
| 360 | DECLARE_WAITQUEUE(wq, current); | ||
| 361 | add_wait_queue(&async_new, &wq); | ||
| 362 | |||
| 363 | while (!kthread_should_stop()) { | ||
| 364 | int tc, ec; | ||
| 365 | |||
| 366 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 367 | |||
| 368 | tc = atomic_read(&thread_count); | ||
| 369 | rmb(); | ||
| 370 | ec = atomic_read(&entry_count); | ||
| 371 | |||
| 372 | while (tc < ec && tc < MAX_THREADS) { | ||
| 373 | if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", | ||
| 374 | tc))) { | ||
| 375 | msleep(100); | ||
| 376 | continue; | ||
| 377 | } | ||
| 378 | atomic_inc(&thread_count); | ||
| 379 | tc++; | ||
| 380 | } | ||
| 381 | |||
| 382 | schedule(); | ||
| 383 | } | ||
| 384 | remove_wait_queue(&async_new, &wq); | ||
| 385 | |||
| 386 | return 0; | ||
| 387 | } | ||
| 388 | |||
| 389 | static int __init async_init(void) | ||
| 390 | { | ||
| 391 | async_enabled = | ||
| 392 | !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); | ||
| 393 | |||
| 394 | WARN_ON(!async_enabled); | ||
| 395 | return 0; | ||
| 396 | } | ||
| 397 | |||
| 398 | core_initcall(async_init); | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a8ce09954404..d83cab06da87 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { | |||
| 1623 | .kill_sb = cgroup_kill_sb, | 1623 | .kill_sb = cgroup_kill_sb, |
| 1624 | }; | 1624 | }; |
| 1625 | 1625 | ||
| 1626 | static struct kobject *cgroup_kobj; | ||
| 1627 | |||
| 1626 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) | 1628 | static inline struct cgroup *__d_cgrp(struct dentry *dentry) |
| 1627 | { | 1629 | { |
| 1628 | return dentry->d_fsdata; | 1630 | return dentry->d_fsdata; |
| @@ -3894,9 +3896,18 @@ int __init cgroup_init(void) | |||
| 3894 | hhead = css_set_hash(init_css_set.subsys); | 3896 | hhead = css_set_hash(init_css_set.subsys); |
| 3895 | hlist_add_head(&init_css_set.hlist, hhead); | 3897 | hlist_add_head(&init_css_set.hlist, hhead); |
| 3896 | BUG_ON(!init_root_id(&rootnode)); | 3898 | BUG_ON(!init_root_id(&rootnode)); |
| 3899 | |||
| 3900 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | ||
| 3901 | if (!cgroup_kobj) { | ||
| 3902 | err = -ENOMEM; | ||
| 3903 | goto out; | ||
| 3904 | } | ||
| 3905 | |||
| 3897 | err = register_filesystem(&cgroup_fs_type); | 3906 | err = register_filesystem(&cgroup_fs_type); |
| 3898 | if (err < 0) | 3907 | if (err < 0) { |
| 3908 | kobject_put(cgroup_kobj); | ||
| 3899 | goto out; | 3909 | goto out; |
| 3910 | } | ||
| 3900 | 3911 | ||
| 3901 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); | 3912 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); |
| 3902 | 3913 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4ac..f6e726f18491 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 235 | return -EINVAL; | 235 | return -EINVAL; |
| 236 | 236 | ||
| 237 | cpu_hotplug_begin(); | 237 | cpu_hotplug_begin(); |
| 238 | set_cpu_active(cpu, false); | ||
| 239 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); | 238 | err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); |
| 240 | if (err) { | 239 | if (err) { |
| 241 | set_cpu_active(cpu, true); | ||
| 242 | |||
| 243 | nr_calls--; | 240 | nr_calls--; |
| 244 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); | 241 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); |
| 245 | printk("%s: attempt to take down CPU %u failed\n", | 242 | printk("%s: attempt to take down CPU %u failed\n", |
| @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 249 | 246 | ||
| 250 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 247 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
| 251 | if (err) { | 248 | if (err) { |
| 252 | set_cpu_active(cpu, true); | ||
| 253 | /* CPU didn't die: tell everyone. Can't complain. */ | 249 | /* CPU didn't die: tell everyone. Can't complain. */ |
| 254 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); | 250 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); |
| 255 | 251 | ||
| @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 321 | goto out_notify; | 317 | goto out_notify; |
| 322 | BUG_ON(!cpu_online(cpu)); | 318 | BUG_ON(!cpu_online(cpu)); |
| 323 | 319 | ||
| 324 | set_cpu_active(cpu, true); | ||
| 325 | |||
| 326 | /* Now call notifier in preparation. */ | 320 | /* Now call notifier in preparation. */ |
| 327 | cpu_notify(CPU_ONLINE | mod, hcpu); | 321 | cpu_notify(CPU_ONLINE | mod, hcpu); |
| 328 | 322 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7cb37d86a005..b23c0979bbe7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) | |||
| 2113 | * but making no active use of cpusets. | 2113 | * but making no active use of cpusets. |
| 2114 | * | 2114 | * |
| 2115 | * This routine ensures that top_cpuset.cpus_allowed tracks | 2115 | * This routine ensures that top_cpuset.cpus_allowed tracks |
| 2116 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 2116 | * cpu_active_mask on each CPU hotplug (cpuhp) event. |
| 2117 | * | 2117 | * |
| 2118 | * Called within get_online_cpus(). Needs to call cgroup_lock() | 2118 | * Called within get_online_cpus(). Needs to call cgroup_lock() |
| 2119 | * before calling generate_sched_domains(). | 2119 | * before calling generate_sched_domains(). |
| 2120 | */ | 2120 | */ |
| 2121 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | 2121 | void cpuset_update_active_cpus(void) |
| 2122 | unsigned long phase, void *unused_cpu) | ||
| 2123 | { | 2122 | { |
| 2124 | struct sched_domain_attr *attr; | 2123 | struct sched_domain_attr *attr; |
| 2125 | cpumask_var_t *doms; | 2124 | cpumask_var_t *doms; |
| 2126 | int ndoms; | 2125 | int ndoms; |
| 2127 | 2126 | ||
| 2128 | switch (phase) { | ||
| 2129 | case CPU_ONLINE: | ||
| 2130 | case CPU_ONLINE_FROZEN: | ||
| 2131 | case CPU_DOWN_PREPARE: | ||
| 2132 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 2133 | case CPU_DOWN_FAILED: | ||
| 2134 | case CPU_DOWN_FAILED_FROZEN: | ||
| 2135 | break; | ||
| 2136 | |||
| 2137 | default: | ||
| 2138 | return NOTIFY_DONE; | ||
| 2139 | } | ||
| 2140 | |||
| 2141 | cgroup_lock(); | 2127 | cgroup_lock(); |
| 2142 | mutex_lock(&callback_mutex); | 2128 | mutex_lock(&callback_mutex); |
| 2143 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2129 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
| @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |||
| 2148 | 2134 | ||
| 2149 | /* Have scheduler rebuild the domains */ | 2135 | /* Have scheduler rebuild the domains */ |
| 2150 | partition_sched_domains(ndoms, doms, attr); | 2136 | partition_sched_domains(ndoms, doms, attr); |
| 2151 | |||
| 2152 | return NOTIFY_OK; | ||
| 2153 | } | 2137 | } |
| 2154 | 2138 | ||
| 2155 | #ifdef CONFIG_MEMORY_HOTPLUG | 2139 | #ifdef CONFIG_MEMORY_HOTPLUG |
| @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) | |||
| 2203 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2187 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); |
| 2204 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2188 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
| 2205 | 2189 | ||
| 2206 | hotcpu_notifier(cpuset_track_online_cpus, 0); | ||
| 2207 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2190 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
| 2208 | 2191 | ||
| 2209 | cpuset_wq = create_singlethread_workqueue("cpuset"); | 2192 | cpuset_wq = create_singlethread_workqueue("cpuset"); |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 8577e45a9a58..28b844118bbd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -2548,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) | |||
| 2548 | */ | 2548 | */ |
| 2549 | static int kdb_summary(int argc, const char **argv) | 2549 | static int kdb_summary(int argc, const char **argv) |
| 2550 | { | 2550 | { |
| 2551 | struct timespec now; | ||
| 2551 | struct kdb_tm tm; | 2552 | struct kdb_tm tm; |
| 2552 | struct sysinfo val; | 2553 | struct sysinfo val; |
| 2553 | 2554 | ||
| @@ -2562,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) | |||
| 2562 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); | 2563 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); |
| 2563 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); | 2564 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); |
| 2564 | 2565 | ||
| 2565 | kdb_gmtime(&xtime, &tm); | 2566 | now = __current_kernel_time(); |
| 2567 | kdb_gmtime(&now, &tm); | ||
| 2566 | kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " | 2568 | kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " |
| 2567 | "tz_minuteswest %d\n", | 2569 | "tz_minuteswest %d\n", |
| 2568 | 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, | 2570 | 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, |
diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba047..a82a65cef741 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
| 907 | { | 907 | { |
| 908 | unsigned long new_flags = p->flags; | 908 | unsigned long new_flags = p->flags; |
| 909 | 909 | ||
| 910 | new_flags &= ~PF_SUPERPRIV; | 910 | new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); |
| 911 | new_flags |= PF_FORKNOEXEC; | 911 | new_flags |= PF_FORKNOEXEC; |
| 912 | new_flags |= PF_STARTING; | 912 | new_flags |= PF_STARTING; |
| 913 | p->flags = new_flags; | 913 | p->flags = new_flags; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0f..ce669174f355 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) | |||
| 90 | do { | 90 | do { |
| 91 | seq = read_seqbegin(&xtime_lock); | 91 | seq = read_seqbegin(&xtime_lock); |
| 92 | xts = __current_kernel_time(); | 92 | xts = __current_kernel_time(); |
| 93 | tom = wall_to_monotonic; | 93 | tom = __get_wall_to_monotonic(); |
| 94 | } while (read_seqretry(&xtime_lock, seq)); | 94 | } while (read_seqretry(&xtime_lock, seq)); |
| 95 | 95 | ||
| 96 | xtim = timespec_to_ktime(xts); | 96 | xtim = timespec_to_ktime(xts); |
| @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, | |||
| 144 | static int hrtimer_get_target(int this_cpu, int pinned) | 144 | static int hrtimer_get_target(int this_cpu, int pinned) |
| 145 | { | 145 | { |
| 146 | #ifdef CONFIG_NO_HZ | 146 | #ifdef CONFIG_NO_HZ |
| 147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { | 147 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) |
| 148 | int preferred_cpu = get_nohz_load_balancer(); | 148 | return get_nohz_timer_target(); |
| 149 | |||
| 150 | if (preferred_cpu >= 0) | ||
| 151 | return preferred_cpu; | ||
| 152 | } | ||
| 153 | #endif | 149 | #endif |
| 154 | return this_cpu; | 150 | return this_cpu; |
| 155 | } | 151 | } |
| @@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
| 612 | static void retrigger_next_event(void *arg) | 608 | static void retrigger_next_event(void *arg) |
| 613 | { | 609 | { |
| 614 | struct hrtimer_cpu_base *base; | 610 | struct hrtimer_cpu_base *base; |
| 615 | struct timespec realtime_offset; | 611 | struct timespec realtime_offset, wtm; |
| 616 | unsigned long seq; | 612 | unsigned long seq; |
| 617 | 613 | ||
| 618 | if (!hrtimer_hres_active()) | 614 | if (!hrtimer_hres_active()) |
| @@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) | |||
| 620 | 616 | ||
| 621 | do { | 617 | do { |
| 622 | seq = read_seqbegin(&xtime_lock); | 618 | seq = read_seqbegin(&xtime_lock); |
| 623 | set_normalized_timespec(&realtime_offset, | 619 | wtm = __get_wall_to_monotonic(); |
| 624 | -wall_to_monotonic.tv_sec, | ||
| 625 | -wall_to_monotonic.tv_nsec); | ||
| 626 | } while (read_seqretry(&xtime_lock, seq)); | 620 | } while (read_seqretry(&xtime_lock, seq)); |
| 621 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
| 627 | 622 | ||
| 628 | base = &__get_cpu_var(hrtimer_bases); | 623 | base = &__get_cpu_var(hrtimer_bases); |
| 629 | 624 | ||
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 71ed3ce29e12..d71a987fd2bf 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
| 42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
| 43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
| 44 | #include <linux/list.h> | ||
| 44 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
| 45 | #include <linux/smp.h> | 46 | #include <linux/smp.h> |
| 46 | 47 | ||
| @@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); | |||
| 62 | 63 | ||
| 63 | static int nr_slots[TYPE_MAX]; | 64 | static int nr_slots[TYPE_MAX]; |
| 64 | 65 | ||
| 66 | /* Keep track of the breakpoints attached to tasks */ | ||
| 67 | static LIST_HEAD(bp_task_head); | ||
| 68 | |||
| 65 | static int constraints_initialized; | 69 | static int constraints_initialized; |
| 66 | 70 | ||
| 67 | /* Gather the number of total pinned and un-pinned bp in a cpuset */ | 71 | /* Gather the number of total pinned and un-pinned bp in a cpuset */ |
| @@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
| 103 | return 0; | 107 | return 0; |
| 104 | } | 108 | } |
| 105 | 109 | ||
| 106 | static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) | 110 | /* |
| 111 | * Count the number of breakpoints of the same type and same task. | ||
| 112 | * The given event must be not on the list. | ||
| 113 | */ | ||
| 114 | static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) | ||
| 107 | { | 115 | { |
| 108 | struct perf_event_context *ctx = tsk->perf_event_ctxp; | 116 | struct perf_event_context *ctx = bp->ctx; |
| 109 | struct list_head *list; | 117 | struct perf_event *iter; |
| 110 | struct perf_event *bp; | ||
| 111 | unsigned long flags; | ||
| 112 | int count = 0; | 118 | int count = 0; |
| 113 | 119 | ||
| 114 | if (WARN_ONCE(!ctx, "No perf context for this task")) | 120 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { |
| 115 | return 0; | 121 | if (iter->ctx == ctx && find_slot_idx(iter) == type) |
| 116 | 122 | count += hw_breakpoint_weight(iter); | |
| 117 | list = &ctx->event_list; | ||
| 118 | |||
| 119 | raw_spin_lock_irqsave(&ctx->lock, flags); | ||
| 120 | |||
| 121 | /* | ||
| 122 | * The current breakpoint counter is not included in the list | ||
| 123 | * at the open() callback time | ||
| 124 | */ | ||
| 125 | list_for_each_entry(bp, list, event_entry) { | ||
| 126 | if (bp->attr.type == PERF_TYPE_BREAKPOINT) | ||
| 127 | if (find_slot_idx(bp) == type) | ||
| 128 | count += hw_breakpoint_weight(bp); | ||
| 129 | } | 123 | } |
| 130 | 124 | ||
| 131 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | ||
| 132 | |||
| 133 | return count; | 125 | return count; |
| 134 | } | 126 | } |
| 135 | 127 | ||
| @@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
| 149 | if (!tsk) | 141 | if (!tsk) |
| 150 | slots->pinned += max_task_bp_pinned(cpu, type); | 142 | slots->pinned += max_task_bp_pinned(cpu, type); |
| 151 | else | 143 | else |
| 152 | slots->pinned += task_bp_pinned(tsk, type); | 144 | slots->pinned += task_bp_pinned(bp, type); |
| 153 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); | 145 | slots->flexible = per_cpu(nr_bp_flexible[type], cpu); |
| 154 | 146 | ||
| 155 | return; | 147 | return; |
| @@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
| 162 | if (!tsk) | 154 | if (!tsk) |
| 163 | nr += max_task_bp_pinned(cpu, type); | 155 | nr += max_task_bp_pinned(cpu, type); |
| 164 | else | 156 | else |
| 165 | nr += task_bp_pinned(tsk, type); | 157 | nr += task_bp_pinned(bp, type); |
| 166 | 158 | ||
| 167 | if (nr > slots->pinned) | 159 | if (nr > slots->pinned) |
| 168 | slots->pinned = nr; | 160 | slots->pinned = nr; |
| @@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) | |||
| 188 | /* | 180 | /* |
| 189 | * Add a pinned breakpoint for the given task in our constraint table | 181 | * Add a pinned breakpoint for the given task in our constraint table |
| 190 | */ | 182 | */ |
| 191 | static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, | 183 | static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, |
| 192 | enum bp_type_idx type, int weight) | 184 | enum bp_type_idx type, int weight) |
| 193 | { | 185 | { |
| 194 | unsigned int *tsk_pinned; | 186 | unsigned int *tsk_pinned; |
| @@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, | |||
| 196 | int old_idx = 0; | 188 | int old_idx = 0; |
| 197 | int idx = 0; | 189 | int idx = 0; |
| 198 | 190 | ||
| 199 | old_count = task_bp_pinned(tsk, type); | 191 | old_count = task_bp_pinned(bp, type); |
| 200 | old_idx = old_count - 1; | 192 | old_idx = old_count - 1; |
| 201 | idx = old_idx + weight; | 193 | idx = old_idx + weight; |
| 202 | 194 | ||
| 195 | /* tsk_pinned[n] is the number of tasks having n breakpoints */ | ||
| 203 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); | 196 | tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); |
| 204 | if (enable) { | 197 | if (enable) { |
| 205 | tsk_pinned[idx]++; | 198 | tsk_pinned[idx]++; |
| @@ -222,23 +215,30 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
| 222 | int cpu = bp->cpu; | 215 | int cpu = bp->cpu; |
| 223 | struct task_struct *tsk = bp->ctx->task; | 216 | struct task_struct *tsk = bp->ctx->task; |
| 224 | 217 | ||
| 218 | /* Pinned counter cpu profiling */ | ||
| 219 | if (!tsk) { | ||
| 220 | |||
| 221 | if (enable) | ||
| 222 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | ||
| 223 | else | ||
| 224 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | ||
| 225 | return; | ||
| 226 | } | ||
| 227 | |||
| 225 | /* Pinned counter task profiling */ | 228 | /* Pinned counter task profiling */ |
| 226 | if (tsk) { | ||
| 227 | if (cpu >= 0) { | ||
| 228 | toggle_bp_task_slot(tsk, cpu, enable, type, weight); | ||
| 229 | return; | ||
| 230 | } | ||
| 231 | 229 | ||
| 230 | if (!enable) | ||
| 231 | list_del(&bp->hw.bp_list); | ||
| 232 | |||
| 233 | if (cpu >= 0) { | ||
| 234 | toggle_bp_task_slot(bp, cpu, enable, type, weight); | ||
| 235 | } else { | ||
| 232 | for_each_online_cpu(cpu) | 236 | for_each_online_cpu(cpu) |
| 233 | toggle_bp_task_slot(tsk, cpu, enable, type, weight); | 237 | toggle_bp_task_slot(bp, cpu, enable, type, weight); |
| 234 | return; | ||
| 235 | } | 238 | } |
| 236 | 239 | ||
| 237 | /* Pinned counter cpu profiling */ | ||
| 238 | if (enable) | 240 | if (enable) |
| 239 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; | 241 | list_add_tail(&bp->hw.bp_list, &bp_task_head); |
| 240 | else | ||
| 241 | per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; | ||
| 242 | } | 242 | } |
| 243 | 243 | ||
| 244 | /* | 244 | /* |
| @@ -312,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp) | |||
| 312 | weight = hw_breakpoint_weight(bp); | 312 | weight = hw_breakpoint_weight(bp); |
| 313 | 313 | ||
| 314 | fetch_bp_busy_slots(&slots, bp, type); | 314 | fetch_bp_busy_slots(&slots, bp, type); |
| 315 | /* | ||
| 316 | * Simulate the addition of this breakpoint to the constraints | ||
| 317 | * and see the result. | ||
| 318 | */ | ||
| 315 | fetch_this_slot(&slots, weight); | 319 | fetch_this_slot(&slots, weight); |
| 316 | 320 | ||
| 317 | /* Flexible counters need to keep at least one slot */ | 321 | /* Flexible counters need to keep at least one slot */ |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8a..c3003e9d91a3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) | |||
| 216 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | 216 | void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) |
| 217 | { | 217 | { |
| 218 | if (suspend) { | 218 | if (suspend) { |
| 219 | if (!desc->action || (desc->action->flags & IRQF_TIMER)) | 219 | if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) |
| 220 | return; | 220 | return; |
| 221 | desc->status |= IRQ_SUSPENDED; | 221 | desc->status |= IRQ_SUSPENDED; |
| 222 | } | 222 | } |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c780175..2dc3786349d1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -14,6 +14,8 @@ | |||
| 14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
| 17 | #include <linux/slab.h> | ||
| 18 | #include <linux/freezer.h> | ||
| 17 | #include <trace/events/sched.h> | 19 | #include <trace/events/sched.h> |
| 18 | 20 | ||
| 19 | static DEFINE_SPINLOCK(kthread_create_lock); | 21 | static DEFINE_SPINLOCK(kthread_create_lock); |
| @@ -35,6 +37,7 @@ struct kthread_create_info | |||
| 35 | 37 | ||
| 36 | struct kthread { | 38 | struct kthread { |
| 37 | int should_stop; | 39 | int should_stop; |
| 40 | void *data; | ||
| 38 | struct completion exited; | 41 | struct completion exited; |
| 39 | }; | 42 | }; |
| 40 | 43 | ||
| @@ -54,6 +57,19 @@ int kthread_should_stop(void) | |||
| 54 | } | 57 | } |
| 55 | EXPORT_SYMBOL(kthread_should_stop); | 58 | EXPORT_SYMBOL(kthread_should_stop); |
| 56 | 59 | ||
| 60 | /** | ||
| 61 | * kthread_data - return data value specified on kthread creation | ||
| 62 | * @task: kthread task in question | ||
| 63 | * | ||
| 64 | * Return the data value specified when kthread @task was created. | ||
| 65 | * The caller is responsible for ensuring the validity of @task when | ||
| 66 | * calling this function. | ||
| 67 | */ | ||
| 68 | void *kthread_data(struct task_struct *task) | ||
| 69 | { | ||
| 70 | return to_kthread(task)->data; | ||
| 71 | } | ||
| 72 | |||
| 57 | static int kthread(void *_create) | 73 | static int kthread(void *_create) |
| 58 | { | 74 | { |
| 59 | /* Copy data: it's on kthread's stack */ | 75 | /* Copy data: it's on kthread's stack */ |
| @@ -64,6 +80,7 @@ static int kthread(void *_create) | |||
| 64 | int ret; | 80 | int ret; |
| 65 | 81 | ||
| 66 | self.should_stop = 0; | 82 | self.should_stop = 0; |
| 83 | self.data = data; | ||
| 67 | init_completion(&self.exited); | 84 | init_completion(&self.exited); |
| 68 | current->vfork_done = &self.exited; | 85 | current->vfork_done = &self.exited; |
| 69 | 86 | ||
| @@ -247,3 +264,150 @@ int kthreadd(void *unused) | |||
| 247 | 264 | ||
| 248 | return 0; | 265 | return 0; |
| 249 | } | 266 | } |
| 267 | |||
| 268 | /** | ||
| 269 | * kthread_worker_fn - kthread function to process kthread_worker | ||
| 270 | * @worker_ptr: pointer to initialized kthread_worker | ||
| 271 | * | ||
| 272 | * This function can be used as @threadfn to kthread_create() or | ||
| 273 | * kthread_run() with @worker_ptr argument pointing to an initialized | ||
| 274 | * kthread_worker. The started kthread will process work_list until | ||
| 275 | * it is stopped with kthread_stop(). A kthread can also call | ||
| 276 | * this function directly after extra initialization. | ||
| 277 | * | ||
| 278 | * Different kthreads can be used for the same kthread_worker as long | ||
| 279 | * as there's only one kthread attached to it at any given time. A | ||
| 280 | * kthread_worker without an attached kthread simply collects queued | ||
| 281 | * kthread_works. | ||
| 282 | */ | ||
| 283 | int kthread_worker_fn(void *worker_ptr) | ||
| 284 | { | ||
| 285 | struct kthread_worker *worker = worker_ptr; | ||
| 286 | struct kthread_work *work; | ||
| 287 | |||
| 288 | WARN_ON(worker->task); | ||
| 289 | worker->task = current; | ||
| 290 | repeat: | ||
| 291 | set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ | ||
| 292 | |||
| 293 | if (kthread_should_stop()) { | ||
| 294 | __set_current_state(TASK_RUNNING); | ||
| 295 | spin_lock_irq(&worker->lock); | ||
| 296 | worker->task = NULL; | ||
| 297 | spin_unlock_irq(&worker->lock); | ||
| 298 | return 0; | ||
| 299 | } | ||
| 300 | |||
| 301 | work = NULL; | ||
| 302 | spin_lock_irq(&worker->lock); | ||
| 303 | if (!list_empty(&worker->work_list)) { | ||
| 304 | work = list_first_entry(&worker->work_list, | ||
| 305 | struct kthread_work, node); | ||
| 306 | list_del_init(&work->node); | ||
| 307 | } | ||
| 308 | spin_unlock_irq(&worker->lock); | ||
| 309 | |||
| 310 | if (work) { | ||
| 311 | __set_current_state(TASK_RUNNING); | ||
| 312 | work->func(work); | ||
| 313 | smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ | ||
| 314 | work->done_seq = work->queue_seq; | ||
| 315 | smp_mb(); /* mb worker-b1 paired with flush-b0 */ | ||
| 316 | if (atomic_read(&work->flushing)) | ||
| 317 | wake_up_all(&work->done); | ||
| 318 | } else if (!freezing(current)) | ||
| 319 | schedule(); | ||
| 320 | |||
| 321 | try_to_freeze(); | ||
| 322 | goto repeat; | ||
| 323 | } | ||
| 324 | EXPORT_SYMBOL_GPL(kthread_worker_fn); | ||
| 325 | |||
| 326 | /** | ||
| 327 | * queue_kthread_work - queue a kthread_work | ||
| 328 | * @worker: target kthread_worker | ||
| 329 | * @work: kthread_work to queue | ||
| 330 | * | ||
| 331 | * Queue @work to work processor @worker for async execution. @worker | ||
| 332 | * must have been created with kthread_worker_create(). Returns %true | ||
| 333 | * if @work was successfully queued, %false if it was already pending. | ||
| 334 | */ | ||
| 335 | bool queue_kthread_work(struct kthread_worker *worker, | ||
| 336 | struct kthread_work *work) | ||
| 337 | { | ||
| 338 | bool ret = false; | ||
| 339 | unsigned long flags; | ||
| 340 | |||
| 341 | spin_lock_irqsave(&worker->lock, flags); | ||
| 342 | if (list_empty(&work->node)) { | ||
| 343 | list_add_tail(&work->node, &worker->work_list); | ||
| 344 | work->queue_seq++; | ||
| 345 | if (likely(worker->task)) | ||
| 346 | wake_up_process(worker->task); | ||
| 347 | ret = true; | ||
| 348 | } | ||
| 349 | spin_unlock_irqrestore(&worker->lock, flags); | ||
| 350 | return ret; | ||
| 351 | } | ||
| 352 | EXPORT_SYMBOL_GPL(queue_kthread_work); | ||
| 353 | |||
| 354 | /** | ||
| 355 | * flush_kthread_work - flush a kthread_work | ||
| 356 | * @work: work to flush | ||
| 357 | * | ||
| 358 | * If @work is queued or executing, wait for it to finish execution. | ||
| 359 | */ | ||
| 360 | void flush_kthread_work(struct kthread_work *work) | ||
| 361 | { | ||
| 362 | int seq = work->queue_seq; | ||
| 363 | |||
| 364 | atomic_inc(&work->flushing); | ||
| 365 | |||
| 366 | /* | ||
| 367 | * mb flush-b0 paired with worker-b1, to make sure either | ||
| 368 | * worker sees the above increment or we see done_seq update. | ||
| 369 | */ | ||
| 370 | smp_mb__after_atomic_inc(); | ||
| 371 | |||
| 372 | /* A - B <= 0 tests whether B is in front of A regardless of overflow */ | ||
| 373 | wait_event(work->done, seq - work->done_seq <= 0); | ||
| 374 | atomic_dec(&work->flushing); | ||
| 375 | |||
| 376 | /* | ||
| 377 | * rmb flush-b1 paired with worker-b0, to make sure our caller | ||
| 378 | * sees every change made by work->func(). | ||
| 379 | */ | ||
| 380 | smp_mb__after_atomic_dec(); | ||
| 381 | } | ||
| 382 | EXPORT_SYMBOL_GPL(flush_kthread_work); | ||
| 383 | |||
| 384 | struct kthread_flush_work { | ||
| 385 | struct kthread_work work; | ||
| 386 | struct completion done; | ||
| 387 | }; | ||
| 388 | |||
| 389 | static void kthread_flush_work_fn(struct kthread_work *work) | ||
| 390 | { | ||
| 391 | struct kthread_flush_work *fwork = | ||
| 392 | container_of(work, struct kthread_flush_work, work); | ||
| 393 | complete(&fwork->done); | ||
| 394 | } | ||
| 395 | |||
| 396 | /** | ||
| 397 | * flush_kthread_worker - flush all current works on a kthread_worker | ||
| 398 | * @worker: worker to flush | ||
| 399 | * | ||
| 400 | * Wait until all currently executing or pending works on @worker are | ||
| 401 | * finished. | ||
| 402 | */ | ||
| 403 | void flush_kthread_worker(struct kthread_worker *worker) | ||
| 404 | { | ||
| 405 | struct kthread_flush_work fwork = { | ||
| 406 | KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), | ||
| 407 | COMPLETION_INITIALIZER_ONSTACK(fwork.done), | ||
| 408 | }; | ||
| 409 | |||
| 410 | queue_kthread_work(worker, &fwork.work); | ||
| 411 | wait_for_completion(&fwork.done); | ||
| 412 | } | ||
| 413 | EXPORT_SYMBOL_GPL(flush_kthread_worker); | ||
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37b..f2852a510232 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], | |||
| 146 | 146 | ||
| 147 | static inline u64 lockstat_clock(void) | 147 | static inline u64 lockstat_clock(void) |
| 148 | { | 148 | { |
| 149 | return cpu_clock(smp_processor_id()); | 149 | return local_clock(); |
| 150 | } | 150 | } |
| 151 | 151 | ||
| 152 | static int lock_point(unsigned long points[], unsigned long ip) | 152 | static int lock_point(unsigned long points[], unsigned long ip) |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4c..403d1804b198 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
| @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
| 214 | 214 | ||
| 215 | static inline u64 perf_clock(void) | 215 | static inline u64 perf_clock(void) |
| 216 | { | 216 | { |
| 217 | return cpu_clock(raw_smp_processor_id()); | 217 | return local_clock(); |
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | /* | 220 | /* |
| @@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event, | |||
| 675 | struct perf_event *event, *partial_group = NULL; | 675 | struct perf_event *event, *partial_group = NULL; |
| 676 | const struct pmu *pmu = group_event->pmu; | 676 | const struct pmu *pmu = group_event->pmu; |
| 677 | bool txn = false; | 677 | bool txn = false; |
| 678 | int ret; | ||
| 679 | 678 | ||
| 680 | if (group_event->state == PERF_EVENT_STATE_OFF) | 679 | if (group_event->state == PERF_EVENT_STATE_OFF) |
| 681 | return 0; | 680 | return 0; |
| @@ -703,14 +702,8 @@ group_sched_in(struct perf_event *group_event, | |||
| 703 | } | 702 | } |
| 704 | } | 703 | } |
| 705 | 704 | ||
| 706 | if (!txn) | 705 | if (!txn || !pmu->commit_txn(pmu)) |
| 707 | return 0; | ||
| 708 | |||
| 709 | ret = pmu->commit_txn(pmu); | ||
| 710 | if (!ret) { | ||
| 711 | pmu->cancel_txn(pmu); | ||
| 712 | return 0; | 706 | return 0; |
| 713 | } | ||
| 714 | 707 | ||
| 715 | group_error: | 708 | group_error: |
| 716 | /* | 709 | /* |
| @@ -1155,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event, | |||
| 1155 | * In order to keep per-task stats reliable we need to flip the event | 1148 | * In order to keep per-task stats reliable we need to flip the event |
| 1156 | * values when we flip the contexts. | 1149 | * values when we flip the contexts. |
| 1157 | */ | 1150 | */ |
| 1158 | value = atomic64_read(&next_event->count); | 1151 | value = local64_read(&next_event->count); |
| 1159 | value = atomic64_xchg(&event->count, value); | 1152 | value = local64_xchg(&event->count, value); |
| 1160 | atomic64_set(&next_event->count, value); | 1153 | local64_set(&next_event->count, value); |
| 1161 | 1154 | ||
| 1162 | swap(event->total_time_enabled, next_event->total_time_enabled); | 1155 | swap(event->total_time_enabled, next_event->total_time_enabled); |
| 1163 | swap(event->total_time_running, next_event->total_time_running); | 1156 | swap(event->total_time_running, next_event->total_time_running); |
| @@ -1547,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) | |||
| 1547 | 1540 | ||
| 1548 | hwc->sample_period = sample_period; | 1541 | hwc->sample_period = sample_period; |
| 1549 | 1542 | ||
| 1550 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { | 1543 | if (local64_read(&hwc->period_left) > 8*sample_period) { |
| 1551 | perf_disable(); | 1544 | perf_disable(); |
| 1552 | perf_event_stop(event); | 1545 | perf_event_stop(event); |
| 1553 | atomic64_set(&hwc->period_left, 0); | 1546 | local64_set(&hwc->period_left, 0); |
| 1554 | perf_event_start(event); | 1547 | perf_event_start(event); |
| 1555 | perf_enable(); | 1548 | perf_enable(); |
| 1556 | } | 1549 | } |
| @@ -1591,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) | |||
| 1591 | 1584 | ||
| 1592 | perf_disable(); | 1585 | perf_disable(); |
| 1593 | event->pmu->read(event); | 1586 | event->pmu->read(event); |
| 1594 | now = atomic64_read(&event->count); | 1587 | now = local64_read(&event->count); |
| 1595 | delta = now - hwc->freq_count_stamp; | 1588 | delta = now - hwc->freq_count_stamp; |
| 1596 | hwc->freq_count_stamp = now; | 1589 | hwc->freq_count_stamp = now; |
| 1597 | 1590 | ||
| @@ -1743,6 +1736,11 @@ static void __perf_event_read(void *info) | |||
| 1743 | event->pmu->read(event); | 1736 | event->pmu->read(event); |
| 1744 | } | 1737 | } |
| 1745 | 1738 | ||
| 1739 | static inline u64 perf_event_count(struct perf_event *event) | ||
| 1740 | { | ||
| 1741 | return local64_read(&event->count) + atomic64_read(&event->child_count); | ||
| 1742 | } | ||
| 1743 | |||
| 1746 | static u64 perf_event_read(struct perf_event *event) | 1744 | static u64 perf_event_read(struct perf_event *event) |
| 1747 | { | 1745 | { |
| 1748 | /* | 1746 | /* |
| @@ -1762,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event) | |||
| 1762 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 1760 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 1763 | } | 1761 | } |
| 1764 | 1762 | ||
| 1765 | return atomic64_read(&event->count); | 1763 | return perf_event_count(event); |
| 1766 | } | 1764 | } |
| 1767 | 1765 | ||
| 1768 | /* | 1766 | /* |
| @@ -1883,7 +1881,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 1883 | } | 1881 | } |
| 1884 | 1882 | ||
| 1885 | static void perf_pending_sync(struct perf_event *event); | 1883 | static void perf_pending_sync(struct perf_event *event); |
| 1886 | static void perf_mmap_data_put(struct perf_mmap_data *data); | 1884 | static void perf_buffer_put(struct perf_buffer *buffer); |
| 1887 | 1885 | ||
| 1888 | static void free_event(struct perf_event *event) | 1886 | static void free_event(struct perf_event *event) |
| 1889 | { | 1887 | { |
| @@ -1891,7 +1889,7 @@ static void free_event(struct perf_event *event) | |||
| 1891 | 1889 | ||
| 1892 | if (!event->parent) { | 1890 | if (!event->parent) { |
| 1893 | atomic_dec(&nr_events); | 1891 | atomic_dec(&nr_events); |
| 1894 | if (event->attr.mmap) | 1892 | if (event->attr.mmap || event->attr.mmap_data) |
| 1895 | atomic_dec(&nr_mmap_events); | 1893 | atomic_dec(&nr_mmap_events); |
| 1896 | if (event->attr.comm) | 1894 | if (event->attr.comm) |
| 1897 | atomic_dec(&nr_comm_events); | 1895 | atomic_dec(&nr_comm_events); |
| @@ -1899,9 +1897,9 @@ static void free_event(struct perf_event *event) | |||
| 1899 | atomic_dec(&nr_task_events); | 1897 | atomic_dec(&nr_task_events); |
| 1900 | } | 1898 | } |
| 1901 | 1899 | ||
| 1902 | if (event->data) { | 1900 | if (event->buffer) { |
| 1903 | perf_mmap_data_put(event->data); | 1901 | perf_buffer_put(event->buffer); |
| 1904 | event->data = NULL; | 1902 | event->buffer = NULL; |
| 1905 | } | 1903 | } |
| 1906 | 1904 | ||
| 1907 | if (event->destroy) | 1905 | if (event->destroy) |
| @@ -2126,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
| 2126 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 2124 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
| 2127 | { | 2125 | { |
| 2128 | struct perf_event *event = file->private_data; | 2126 | struct perf_event *event = file->private_data; |
| 2129 | struct perf_mmap_data *data; | 2127 | struct perf_buffer *buffer; |
| 2130 | unsigned int events = POLL_HUP; | 2128 | unsigned int events = POLL_HUP; |
| 2131 | 2129 | ||
| 2132 | rcu_read_lock(); | 2130 | rcu_read_lock(); |
| 2133 | data = rcu_dereference(event->data); | 2131 | buffer = rcu_dereference(event->buffer); |
| 2134 | if (data) | 2132 | if (buffer) |
| 2135 | events = atomic_xchg(&data->poll, 0); | 2133 | events = atomic_xchg(&buffer->poll, 0); |
| 2136 | rcu_read_unlock(); | 2134 | rcu_read_unlock(); |
| 2137 | 2135 | ||
| 2138 | poll_wait(file, &event->waitq, wait); | 2136 | poll_wait(file, &event->waitq, wait); |
| @@ -2143,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
| 2143 | static void perf_event_reset(struct perf_event *event) | 2141 | static void perf_event_reset(struct perf_event *event) |
| 2144 | { | 2142 | { |
| 2145 | (void)perf_event_read(event); | 2143 | (void)perf_event_read(event); |
| 2146 | atomic64_set(&event->count, 0); | 2144 | local64_set(&event->count, 0); |
| 2147 | perf_event_update_userpage(event); | 2145 | perf_event_update_userpage(event); |
| 2148 | } | 2146 | } |
| 2149 | 2147 | ||
| @@ -2342,14 +2340,14 @@ static int perf_event_index(struct perf_event *event) | |||
| 2342 | void perf_event_update_userpage(struct perf_event *event) | 2340 | void perf_event_update_userpage(struct perf_event *event) |
| 2343 | { | 2341 | { |
| 2344 | struct perf_event_mmap_page *userpg; | 2342 | struct perf_event_mmap_page *userpg; |
| 2345 | struct perf_mmap_data *data; | 2343 | struct perf_buffer *buffer; |
| 2346 | 2344 | ||
| 2347 | rcu_read_lock(); | 2345 | rcu_read_lock(); |
| 2348 | data = rcu_dereference(event->data); | 2346 | buffer = rcu_dereference(event->buffer); |
| 2349 | if (!data) | 2347 | if (!buffer) |
| 2350 | goto unlock; | 2348 | goto unlock; |
| 2351 | 2349 | ||
| 2352 | userpg = data->user_page; | 2350 | userpg = buffer->user_page; |
| 2353 | 2351 | ||
| 2354 | /* | 2352 | /* |
| 2355 | * Disable preemption so as to not let the corresponding user-space | 2353 | * Disable preemption so as to not let the corresponding user-space |
| @@ -2359,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event) | |||
| 2359 | ++userpg->lock; | 2357 | ++userpg->lock; |
| 2360 | barrier(); | 2358 | barrier(); |
| 2361 | userpg->index = perf_event_index(event); | 2359 | userpg->index = perf_event_index(event); |
| 2362 | userpg->offset = atomic64_read(&event->count); | 2360 | userpg->offset = perf_event_count(event); |
| 2363 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2361 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
| 2364 | userpg->offset -= atomic64_read(&event->hw.prev_count); | 2362 | userpg->offset -= local64_read(&event->hw.prev_count); |
| 2365 | 2363 | ||
| 2366 | userpg->time_enabled = event->total_time_enabled + | 2364 | userpg->time_enabled = event->total_time_enabled + |
| 2367 | atomic64_read(&event->child_total_time_enabled); | 2365 | atomic64_read(&event->child_total_time_enabled); |
| @@ -2376,6 +2374,25 @@ unlock: | |||
| 2376 | rcu_read_unlock(); | 2374 | rcu_read_unlock(); |
| 2377 | } | 2375 | } |
| 2378 | 2376 | ||
| 2377 | static unsigned long perf_data_size(struct perf_buffer *buffer); | ||
| 2378 | |||
| 2379 | static void | ||
| 2380 | perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) | ||
| 2381 | { | ||
| 2382 | long max_size = perf_data_size(buffer); | ||
| 2383 | |||
| 2384 | if (watermark) | ||
| 2385 | buffer->watermark = min(max_size, watermark); | ||
| 2386 | |||
| 2387 | if (!buffer->watermark) | ||
| 2388 | buffer->watermark = max_size / 2; | ||
| 2389 | |||
| 2390 | if (flags & PERF_BUFFER_WRITABLE) | ||
| 2391 | buffer->writable = 1; | ||
| 2392 | |||
| 2393 | atomic_set(&buffer->refcount, 1); | ||
| 2394 | } | ||
| 2395 | |||
| 2379 | #ifndef CONFIG_PERF_USE_VMALLOC | 2396 | #ifndef CONFIG_PERF_USE_VMALLOC |
| 2380 | 2397 | ||
| 2381 | /* | 2398 | /* |
| @@ -2383,15 +2400,15 @@ unlock: | |||
| 2383 | */ | 2400 | */ |
| 2384 | 2401 | ||
| 2385 | static struct page * | 2402 | static struct page * |
| 2386 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2403 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) |
| 2387 | { | 2404 | { |
| 2388 | if (pgoff > data->nr_pages) | 2405 | if (pgoff > buffer->nr_pages) |
| 2389 | return NULL; | 2406 | return NULL; |
| 2390 | 2407 | ||
| 2391 | if (pgoff == 0) | 2408 | if (pgoff == 0) |
| 2392 | return virt_to_page(data->user_page); | 2409 | return virt_to_page(buffer->user_page); |
| 2393 | 2410 | ||
| 2394 | return virt_to_page(data->data_pages[pgoff - 1]); | 2411 | return virt_to_page(buffer->data_pages[pgoff - 1]); |
| 2395 | } | 2412 | } |
| 2396 | 2413 | ||
| 2397 | static void *perf_mmap_alloc_page(int cpu) | 2414 | static void *perf_mmap_alloc_page(int cpu) |
| @@ -2407,42 +2424,44 @@ static void *perf_mmap_alloc_page(int cpu) | |||
| 2407 | return page_address(page); | 2424 | return page_address(page); |
| 2408 | } | 2425 | } |
| 2409 | 2426 | ||
| 2410 | static struct perf_mmap_data * | 2427 | static struct perf_buffer * |
| 2411 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2428 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) |
| 2412 | { | 2429 | { |
| 2413 | struct perf_mmap_data *data; | 2430 | struct perf_buffer *buffer; |
| 2414 | unsigned long size; | 2431 | unsigned long size; |
| 2415 | int i; | 2432 | int i; |
| 2416 | 2433 | ||
| 2417 | size = sizeof(struct perf_mmap_data); | 2434 | size = sizeof(struct perf_buffer); |
| 2418 | size += nr_pages * sizeof(void *); | 2435 | size += nr_pages * sizeof(void *); |
| 2419 | 2436 | ||
| 2420 | data = kzalloc(size, GFP_KERNEL); | 2437 | buffer = kzalloc(size, GFP_KERNEL); |
| 2421 | if (!data) | 2438 | if (!buffer) |
| 2422 | goto fail; | 2439 | goto fail; |
| 2423 | 2440 | ||
| 2424 | data->user_page = perf_mmap_alloc_page(event->cpu); | 2441 | buffer->user_page = perf_mmap_alloc_page(cpu); |
| 2425 | if (!data->user_page) | 2442 | if (!buffer->user_page) |
| 2426 | goto fail_user_page; | 2443 | goto fail_user_page; |
| 2427 | 2444 | ||
| 2428 | for (i = 0; i < nr_pages; i++) { | 2445 | for (i = 0; i < nr_pages; i++) { |
| 2429 | data->data_pages[i] = perf_mmap_alloc_page(event->cpu); | 2446 | buffer->data_pages[i] = perf_mmap_alloc_page(cpu); |
| 2430 | if (!data->data_pages[i]) | 2447 | if (!buffer->data_pages[i]) |
| 2431 | goto fail_data_pages; | 2448 | goto fail_data_pages; |
| 2432 | } | 2449 | } |
| 2433 | 2450 | ||
| 2434 | data->nr_pages = nr_pages; | 2451 | buffer->nr_pages = nr_pages; |
| 2452 | |||
| 2453 | perf_buffer_init(buffer, watermark, flags); | ||
| 2435 | 2454 | ||
| 2436 | return data; | 2455 | return buffer; |
| 2437 | 2456 | ||
| 2438 | fail_data_pages: | 2457 | fail_data_pages: |
| 2439 | for (i--; i >= 0; i--) | 2458 | for (i--; i >= 0; i--) |
| 2440 | free_page((unsigned long)data->data_pages[i]); | 2459 | free_page((unsigned long)buffer->data_pages[i]); |
| 2441 | 2460 | ||
| 2442 | free_page((unsigned long)data->user_page); | 2461 | free_page((unsigned long)buffer->user_page); |
| 2443 | 2462 | ||
| 2444 | fail_user_page: | 2463 | fail_user_page: |
| 2445 | kfree(data); | 2464 | kfree(buffer); |
| 2446 | 2465 | ||
| 2447 | fail: | 2466 | fail: |
| 2448 | return NULL; | 2467 | return NULL; |
| @@ -2456,17 +2475,17 @@ static void perf_mmap_free_page(unsigned long addr) | |||
| 2456 | __free_page(page); | 2475 | __free_page(page); |
| 2457 | } | 2476 | } |
| 2458 | 2477 | ||
| 2459 | static void perf_mmap_data_free(struct perf_mmap_data *data) | 2478 | static void perf_buffer_free(struct perf_buffer *buffer) |
| 2460 | { | 2479 | { |
| 2461 | int i; | 2480 | int i; |
| 2462 | 2481 | ||
| 2463 | perf_mmap_free_page((unsigned long)data->user_page); | 2482 | perf_mmap_free_page((unsigned long)buffer->user_page); |
| 2464 | for (i = 0; i < data->nr_pages; i++) | 2483 | for (i = 0; i < buffer->nr_pages; i++) |
| 2465 | perf_mmap_free_page((unsigned long)data->data_pages[i]); | 2484 | perf_mmap_free_page((unsigned long)buffer->data_pages[i]); |
| 2466 | kfree(data); | 2485 | kfree(buffer); |
| 2467 | } | 2486 | } |
| 2468 | 2487 | ||
| 2469 | static inline int page_order(struct perf_mmap_data *data) | 2488 | static inline int page_order(struct perf_buffer *buffer) |
| 2470 | { | 2489 | { |
| 2471 | return 0; | 2490 | return 0; |
| 2472 | } | 2491 | } |
| @@ -2479,18 +2498,18 @@ static inline int page_order(struct perf_mmap_data *data) | |||
| 2479 | * Required for architectures that have d-cache aliasing issues. | 2498 | * Required for architectures that have d-cache aliasing issues. |
| 2480 | */ | 2499 | */ |
| 2481 | 2500 | ||
| 2482 | static inline int page_order(struct perf_mmap_data *data) | 2501 | static inline int page_order(struct perf_buffer *buffer) |
| 2483 | { | 2502 | { |
| 2484 | return data->page_order; | 2503 | return buffer->page_order; |
| 2485 | } | 2504 | } |
| 2486 | 2505 | ||
| 2487 | static struct page * | 2506 | static struct page * |
| 2488 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) | 2507 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) |
| 2489 | { | 2508 | { |
| 2490 | if (pgoff > (1UL << page_order(data))) | 2509 | if (pgoff > (1UL << page_order(buffer))) |
| 2491 | return NULL; | 2510 | return NULL; |
| 2492 | 2511 | ||
| 2493 | return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); | 2512 | return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); |
| 2494 | } | 2513 | } |
| 2495 | 2514 | ||
| 2496 | static void perf_mmap_unmark_page(void *addr) | 2515 | static void perf_mmap_unmark_page(void *addr) |
| @@ -2500,57 +2519,59 @@ static void perf_mmap_unmark_page(void *addr) | |||
| 2500 | page->mapping = NULL; | 2519 | page->mapping = NULL; |
| 2501 | } | 2520 | } |
| 2502 | 2521 | ||
| 2503 | static void perf_mmap_data_free_work(struct work_struct *work) | 2522 | static void perf_buffer_free_work(struct work_struct *work) |
| 2504 | { | 2523 | { |
| 2505 | struct perf_mmap_data *data; | 2524 | struct perf_buffer *buffer; |
| 2506 | void *base; | 2525 | void *base; |
| 2507 | int i, nr; | 2526 | int i, nr; |
| 2508 | 2527 | ||
| 2509 | data = container_of(work, struct perf_mmap_data, work); | 2528 | buffer = container_of(work, struct perf_buffer, work); |
| 2510 | nr = 1 << page_order(data); | 2529 | nr = 1 << page_order(buffer); |
| 2511 | 2530 | ||
| 2512 | base = data->user_page; | 2531 | base = buffer->user_page; |
| 2513 | for (i = 0; i < nr + 1; i++) | 2532 | for (i = 0; i < nr + 1; i++) |
| 2514 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | 2533 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); |
| 2515 | 2534 | ||
| 2516 | vfree(base); | 2535 | vfree(base); |
| 2517 | kfree(data); | 2536 | kfree(buffer); |
| 2518 | } | 2537 | } |
| 2519 | 2538 | ||
| 2520 | static void perf_mmap_data_free(struct perf_mmap_data *data) | 2539 | static void perf_buffer_free(struct perf_buffer *buffer) |
| 2521 | { | 2540 | { |
| 2522 | schedule_work(&data->work); | 2541 | schedule_work(&buffer->work); |
| 2523 | } | 2542 | } |
| 2524 | 2543 | ||
| 2525 | static struct perf_mmap_data * | 2544 | static struct perf_buffer * |
| 2526 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) | 2545 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) |
| 2527 | { | 2546 | { |
| 2528 | struct perf_mmap_data *data; | 2547 | struct perf_buffer *buffer; |
| 2529 | unsigned long size; | 2548 | unsigned long size; |
| 2530 | void *all_buf; | 2549 | void *all_buf; |
| 2531 | 2550 | ||
| 2532 | size = sizeof(struct perf_mmap_data); | 2551 | size = sizeof(struct perf_buffer); |
| 2533 | size += sizeof(void *); | 2552 | size += sizeof(void *); |
| 2534 | 2553 | ||
| 2535 | data = kzalloc(size, GFP_KERNEL); | 2554 | buffer = kzalloc(size, GFP_KERNEL); |
| 2536 | if (!data) | 2555 | if (!buffer) |
| 2537 | goto fail; | 2556 | goto fail; |
| 2538 | 2557 | ||
| 2539 | INIT_WORK(&data->work, perf_mmap_data_free_work); | 2558 | INIT_WORK(&buffer->work, perf_buffer_free_work); |
| 2540 | 2559 | ||
| 2541 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | 2560 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); |
| 2542 | if (!all_buf) | 2561 | if (!all_buf) |
| 2543 | goto fail_all_buf; | 2562 | goto fail_all_buf; |
| 2544 | 2563 | ||
| 2545 | data->user_page = all_buf; | 2564 | buffer->user_page = all_buf; |
| 2546 | data->data_pages[0] = all_buf + PAGE_SIZE; | 2565 | buffer->data_pages[0] = all_buf + PAGE_SIZE; |
| 2547 | data->page_order = ilog2(nr_pages); | 2566 | buffer->page_order = ilog2(nr_pages); |
| 2548 | data->nr_pages = 1; | 2567 | buffer->nr_pages = 1; |
| 2568 | |||
| 2569 | perf_buffer_init(buffer, watermark, flags); | ||
| 2549 | 2570 | ||
| 2550 | return data; | 2571 | return buffer; |
| 2551 | 2572 | ||
| 2552 | fail_all_buf: | 2573 | fail_all_buf: |
| 2553 | kfree(data); | 2574 | kfree(buffer); |
| 2554 | 2575 | ||
| 2555 | fail: | 2576 | fail: |
| 2556 | return NULL; | 2577 | return NULL; |
| @@ -2558,15 +2579,15 @@ fail: | |||
| 2558 | 2579 | ||
| 2559 | #endif | 2580 | #endif |
| 2560 | 2581 | ||
| 2561 | static unsigned long perf_data_size(struct perf_mmap_data *data) | 2582 | static unsigned long perf_data_size(struct perf_buffer *buffer) |
| 2562 | { | 2583 | { |
| 2563 | return data->nr_pages << (PAGE_SHIFT + page_order(data)); | 2584 | return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); |
| 2564 | } | 2585 | } |
| 2565 | 2586 | ||
| 2566 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 2587 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 2567 | { | 2588 | { |
| 2568 | struct perf_event *event = vma->vm_file->private_data; | 2589 | struct perf_event *event = vma->vm_file->private_data; |
| 2569 | struct perf_mmap_data *data; | 2590 | struct perf_buffer *buffer; |
| 2570 | int ret = VM_FAULT_SIGBUS; | 2591 | int ret = VM_FAULT_SIGBUS; |
| 2571 | 2592 | ||
| 2572 | if (vmf->flags & FAULT_FLAG_MKWRITE) { | 2593 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
| @@ -2576,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 2576 | } | 2597 | } |
| 2577 | 2598 | ||
| 2578 | rcu_read_lock(); | 2599 | rcu_read_lock(); |
| 2579 | data = rcu_dereference(event->data); | 2600 | buffer = rcu_dereference(event->buffer); |
| 2580 | if (!data) | 2601 | if (!buffer) |
| 2581 | goto unlock; | 2602 | goto unlock; |
| 2582 | 2603 | ||
| 2583 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) | 2604 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) |
| 2584 | goto unlock; | 2605 | goto unlock; |
| 2585 | 2606 | ||
| 2586 | vmf->page = perf_mmap_to_page(data, vmf->pgoff); | 2607 | vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); |
| 2587 | if (!vmf->page) | 2608 | if (!vmf->page) |
| 2588 | goto unlock; | 2609 | goto unlock; |
| 2589 | 2610 | ||
| @@ -2598,52 +2619,35 @@ unlock: | |||
| 2598 | return ret; | 2619 | return ret; |
| 2599 | } | 2620 | } |
| 2600 | 2621 | ||
| 2601 | static void | 2622 | static void perf_buffer_free_rcu(struct rcu_head *rcu_head) |
| 2602 | perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) | ||
| 2603 | { | ||
| 2604 | long max_size = perf_data_size(data); | ||
| 2605 | |||
| 2606 | if (event->attr.watermark) { | ||
| 2607 | data->watermark = min_t(long, max_size, | ||
| 2608 | event->attr.wakeup_watermark); | ||
| 2609 | } | ||
| 2610 | |||
| 2611 | if (!data->watermark) | ||
| 2612 | data->watermark = max_size / 2; | ||
| 2613 | |||
| 2614 | atomic_set(&data->refcount, 1); | ||
| 2615 | rcu_assign_pointer(event->data, data); | ||
| 2616 | } | ||
| 2617 | |||
| 2618 | static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) | ||
| 2619 | { | 2623 | { |
| 2620 | struct perf_mmap_data *data; | 2624 | struct perf_buffer *buffer; |
| 2621 | 2625 | ||
| 2622 | data = container_of(rcu_head, struct perf_mmap_data, rcu_head); | 2626 | buffer = container_of(rcu_head, struct perf_buffer, rcu_head); |
| 2623 | perf_mmap_data_free(data); | 2627 | perf_buffer_free(buffer); |
| 2624 | } | 2628 | } |
| 2625 | 2629 | ||
| 2626 | static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) | 2630 | static struct perf_buffer *perf_buffer_get(struct perf_event *event) |
| 2627 | { | 2631 | { |
| 2628 | struct perf_mmap_data *data; | 2632 | struct perf_buffer *buffer; |
| 2629 | 2633 | ||
| 2630 | rcu_read_lock(); | 2634 | rcu_read_lock(); |
| 2631 | data = rcu_dereference(event->data); | 2635 | buffer = rcu_dereference(event->buffer); |
| 2632 | if (data) { | 2636 | if (buffer) { |
| 2633 | if (!atomic_inc_not_zero(&data->refcount)) | 2637 | if (!atomic_inc_not_zero(&buffer->refcount)) |
| 2634 | data = NULL; | 2638 | buffer = NULL; |
| 2635 | } | 2639 | } |
| 2636 | rcu_read_unlock(); | 2640 | rcu_read_unlock(); |
| 2637 | 2641 | ||
| 2638 | return data; | 2642 | return buffer; |
| 2639 | } | 2643 | } |
| 2640 | 2644 | ||
| 2641 | static void perf_mmap_data_put(struct perf_mmap_data *data) | 2645 | static void perf_buffer_put(struct perf_buffer *buffer) |
| 2642 | { | 2646 | { |
| 2643 | if (!atomic_dec_and_test(&data->refcount)) | 2647 | if (!atomic_dec_and_test(&buffer->refcount)) |
| 2644 | return; | 2648 | return; |
| 2645 | 2649 | ||
| 2646 | call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); | 2650 | call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); |
| 2647 | } | 2651 | } |
| 2648 | 2652 | ||
| 2649 | static void perf_mmap_open(struct vm_area_struct *vma) | 2653 | static void perf_mmap_open(struct vm_area_struct *vma) |
| @@ -2658,16 +2662,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 2658 | struct perf_event *event = vma->vm_file->private_data; | 2662 | struct perf_event *event = vma->vm_file->private_data; |
| 2659 | 2663 | ||
| 2660 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 2664 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
| 2661 | unsigned long size = perf_data_size(event->data); | 2665 | unsigned long size = perf_data_size(event->buffer); |
| 2662 | struct user_struct *user = event->mmap_user; | 2666 | struct user_struct *user = event->mmap_user; |
| 2663 | struct perf_mmap_data *data = event->data; | 2667 | struct perf_buffer *buffer = event->buffer; |
| 2664 | 2668 | ||
| 2665 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 2669 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
| 2666 | vma->vm_mm->locked_vm -= event->mmap_locked; | 2670 | vma->vm_mm->locked_vm -= event->mmap_locked; |
| 2667 | rcu_assign_pointer(event->data, NULL); | 2671 | rcu_assign_pointer(event->buffer, NULL); |
| 2668 | mutex_unlock(&event->mmap_mutex); | 2672 | mutex_unlock(&event->mmap_mutex); |
| 2669 | 2673 | ||
| 2670 | perf_mmap_data_put(data); | 2674 | perf_buffer_put(buffer); |
| 2671 | free_uid(user); | 2675 | free_uid(user); |
| 2672 | } | 2676 | } |
| 2673 | } | 2677 | } |
| @@ -2685,11 +2689,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 2685 | unsigned long user_locked, user_lock_limit; | 2689 | unsigned long user_locked, user_lock_limit; |
| 2686 | struct user_struct *user = current_user(); | 2690 | struct user_struct *user = current_user(); |
| 2687 | unsigned long locked, lock_limit; | 2691 | unsigned long locked, lock_limit; |
| 2688 | struct perf_mmap_data *data; | 2692 | struct perf_buffer *buffer; |
| 2689 | unsigned long vma_size; | 2693 | unsigned long vma_size; |
| 2690 | unsigned long nr_pages; | 2694 | unsigned long nr_pages; |
| 2691 | long user_extra, extra; | 2695 | long user_extra, extra; |
| 2692 | int ret = 0; | 2696 | int ret = 0, flags = 0; |
| 2693 | 2697 | ||
| 2694 | /* | 2698 | /* |
| 2695 | * Don't allow mmap() of inherited per-task counters. This would | 2699 | * Don't allow mmap() of inherited per-task counters. This would |
| @@ -2706,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 2706 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 2710 | nr_pages = (vma_size / PAGE_SIZE) - 1; |
| 2707 | 2711 | ||
| 2708 | /* | 2712 | /* |
| 2709 | * If we have data pages ensure they're a power-of-two number, so we | 2713 | * If we have buffer pages ensure they're a power-of-two number, so we |
| 2710 | * can do bitmasks instead of modulo. | 2714 | * can do bitmasks instead of modulo. |
| 2711 | */ | 2715 | */ |
| 2712 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) | 2716 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) |
| @@ -2720,9 +2724,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 2720 | 2724 | ||
| 2721 | WARN_ON_ONCE(event->ctx->parent_ctx); | 2725 | WARN_ON_ONCE(event->ctx->parent_ctx); |
| 2722 | mutex_lock(&event->mmap_mutex); | 2726 | mutex_lock(&event->mmap_mutex); |
| 2723 | if (event->data) { | 2727 | if (event->buffer) { |
| 2724 | if (event->data->nr_pages == nr_pages) | 2728 | if (event->buffer->nr_pages == nr_pages) |
| 2725 | atomic_inc(&event->data->refcount); | 2729 | atomic_inc(&event->buffer->refcount); |
| 2726 | else | 2730 | else |
| 2727 | ret = -EINVAL; | 2731 | ret = -EINVAL; |
| 2728 | goto unlock; | 2732 | goto unlock; |
| @@ -2752,17 +2756,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 2752 | goto unlock; | 2756 | goto unlock; |
| 2753 | } | 2757 | } |
| 2754 | 2758 | ||
| 2755 | WARN_ON(event->data); | 2759 | WARN_ON(event->buffer); |
| 2760 | |||
| 2761 | if (vma->vm_flags & VM_WRITE) | ||
| 2762 | flags |= PERF_BUFFER_WRITABLE; | ||
| 2756 | 2763 | ||
| 2757 | data = perf_mmap_data_alloc(event, nr_pages); | 2764 | buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, |
| 2758 | if (!data) { | 2765 | event->cpu, flags); |
| 2766 | if (!buffer) { | ||
| 2759 | ret = -ENOMEM; | 2767 | ret = -ENOMEM; |
| 2760 | goto unlock; | 2768 | goto unlock; |
| 2761 | } | 2769 | } |
| 2762 | 2770 | rcu_assign_pointer(event->buffer, buffer); | |
| 2763 | perf_mmap_data_init(event, data); | ||
| 2764 | if (vma->vm_flags & VM_WRITE) | ||
| 2765 | event->data->writable = 1; | ||
| 2766 | 2771 | ||
| 2767 | atomic_long_add(user_extra, &user->locked_vm); | 2772 | atomic_long_add(user_extra, &user->locked_vm); |
| 2768 | event->mmap_locked = extra; | 2773 | event->mmap_locked = extra; |
| @@ -2941,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
| 2941 | return NULL; | 2946 | return NULL; |
| 2942 | } | 2947 | } |
| 2943 | 2948 | ||
| 2944 | __weak | ||
| 2945 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) | ||
| 2946 | { | ||
| 2947 | } | ||
| 2948 | |||
| 2949 | 2949 | ||
| 2950 | /* | 2950 | /* |
| 2951 | * We assume there is only KVM supporting the callbacks. | 2951 | * We assume there is only KVM supporting the callbacks. |
| @@ -2971,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | |||
| 2971 | /* | 2971 | /* |
| 2972 | * Output | 2972 | * Output |
| 2973 | */ | 2973 | */ |
| 2974 | static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, | 2974 | static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, |
| 2975 | unsigned long offset, unsigned long head) | 2975 | unsigned long offset, unsigned long head) |
| 2976 | { | 2976 | { |
| 2977 | unsigned long mask; | 2977 | unsigned long mask; |
| 2978 | 2978 | ||
| 2979 | if (!data->writable) | 2979 | if (!buffer->writable) |
| 2980 | return true; | 2980 | return true; |
| 2981 | 2981 | ||
| 2982 | mask = perf_data_size(data) - 1; | 2982 | mask = perf_data_size(buffer) - 1; |
| 2983 | 2983 | ||
| 2984 | offset = (offset - tail) & mask; | 2984 | offset = (offset - tail) & mask; |
| 2985 | head = (head - tail) & mask; | 2985 | head = (head - tail) & mask; |
| @@ -2992,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, | |||
| 2992 | 2992 | ||
| 2993 | static void perf_output_wakeup(struct perf_output_handle *handle) | 2993 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 2994 | { | 2994 | { |
| 2995 | atomic_set(&handle->data->poll, POLL_IN); | 2995 | atomic_set(&handle->buffer->poll, POLL_IN); |
| 2996 | 2996 | ||
| 2997 | if (handle->nmi) { | 2997 | if (handle->nmi) { |
| 2998 | handle->event->pending_wakeup = 1; | 2998 | handle->event->pending_wakeup = 1; |
| @@ -3012,45 +3012,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) | |||
| 3012 | */ | 3012 | */ |
| 3013 | static void perf_output_get_handle(struct perf_output_handle *handle) | 3013 | static void perf_output_get_handle(struct perf_output_handle *handle) |
| 3014 | { | 3014 | { |
| 3015 | struct perf_mmap_data *data = handle->data; | 3015 | struct perf_buffer *buffer = handle->buffer; |
| 3016 | 3016 | ||
| 3017 | preempt_disable(); | 3017 | preempt_disable(); |
| 3018 | local_inc(&data->nest); | 3018 | local_inc(&buffer->nest); |
| 3019 | handle->wakeup = local_read(&data->wakeup); | 3019 | handle->wakeup = local_read(&buffer->wakeup); |
| 3020 | } | 3020 | } |
| 3021 | 3021 | ||
| 3022 | static void perf_output_put_handle(struct perf_output_handle *handle) | 3022 | static void perf_output_put_handle(struct perf_output_handle *handle) |
| 3023 | { | 3023 | { |
| 3024 | struct perf_mmap_data *data = handle->data; | 3024 | struct perf_buffer *buffer = handle->buffer; |
| 3025 | unsigned long head; | 3025 | unsigned long head; |
| 3026 | 3026 | ||
| 3027 | again: | 3027 | again: |
| 3028 | head = local_read(&data->head); | 3028 | head = local_read(&buffer->head); |
| 3029 | 3029 | ||
| 3030 | /* | 3030 | /* |
| 3031 | * IRQ/NMI can happen here, which means we can miss a head update. | 3031 | * IRQ/NMI can happen here, which means we can miss a head update. |
| 3032 | */ | 3032 | */ |
| 3033 | 3033 | ||
| 3034 | if (!local_dec_and_test(&data->nest)) | 3034 | if (!local_dec_and_test(&buffer->nest)) |
| 3035 | goto out; | 3035 | goto out; |
| 3036 | 3036 | ||
| 3037 | /* | 3037 | /* |
| 3038 | * Publish the known good head. Rely on the full barrier implied | 3038 | * Publish the known good head. Rely on the full barrier implied |
| 3039 | * by atomic_dec_and_test() order the data->head read and this | 3039 | * by atomic_dec_and_test() order the buffer->head read and this |
| 3040 | * write. | 3040 | * write. |
| 3041 | */ | 3041 | */ |
| 3042 | data->user_page->data_head = head; | 3042 | buffer->user_page->data_head = head; |
| 3043 | 3043 | ||
| 3044 | /* | 3044 | /* |
| 3045 | * Now check if we missed an update, rely on the (compiler) | 3045 | * Now check if we missed an update, rely on the (compiler) |
| 3046 | * barrier in atomic_dec_and_test() to re-read data->head. | 3046 | * barrier in atomic_dec_and_test() to re-read buffer->head. |
| 3047 | */ | 3047 | */ |
| 3048 | if (unlikely(head != local_read(&data->head))) { | 3048 | if (unlikely(head != local_read(&buffer->head))) { |
| 3049 | local_inc(&data->nest); | 3049 | local_inc(&buffer->nest); |
| 3050 | goto again; | 3050 | goto again; |
| 3051 | } | 3051 | } |
| 3052 | 3052 | ||
| 3053 | if (handle->wakeup != local_read(&data->wakeup)) | 3053 | if (handle->wakeup != local_read(&buffer->wakeup)) |
| 3054 | perf_output_wakeup(handle); | 3054 | perf_output_wakeup(handle); |
| 3055 | 3055 | ||
| 3056 | out: | 3056 | out: |
| @@ -3070,12 +3070,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, | |||
| 3070 | buf += size; | 3070 | buf += size; |
| 3071 | handle->size -= size; | 3071 | handle->size -= size; |
| 3072 | if (!handle->size) { | 3072 | if (!handle->size) { |
| 3073 | struct perf_mmap_data *data = handle->data; | 3073 | struct perf_buffer *buffer = handle->buffer; |
| 3074 | 3074 | ||
| 3075 | handle->page++; | 3075 | handle->page++; |
| 3076 | handle->page &= data->nr_pages - 1; | 3076 | handle->page &= buffer->nr_pages - 1; |
| 3077 | handle->addr = data->data_pages[handle->page]; | 3077 | handle->addr = buffer->data_pages[handle->page]; |
| 3078 | handle->size = PAGE_SIZE << page_order(data); | 3078 | handle->size = PAGE_SIZE << page_order(buffer); |
| 3079 | } | 3079 | } |
| 3080 | } while (len); | 3080 | } while (len); |
| 3081 | } | 3081 | } |
| @@ -3084,7 +3084,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3084 | struct perf_event *event, unsigned int size, | 3084 | struct perf_event *event, unsigned int size, |
| 3085 | int nmi, int sample) | 3085 | int nmi, int sample) |
| 3086 | { | 3086 | { |
| 3087 | struct perf_mmap_data *data; | 3087 | struct perf_buffer *buffer; |
| 3088 | unsigned long tail, offset, head; | 3088 | unsigned long tail, offset, head; |
| 3089 | int have_lost; | 3089 | int have_lost; |
| 3090 | struct { | 3090 | struct { |
| @@ -3100,19 +3100,19 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3100 | if (event->parent) | 3100 | if (event->parent) |
| 3101 | event = event->parent; | 3101 | event = event->parent; |
| 3102 | 3102 | ||
| 3103 | data = rcu_dereference(event->data); | 3103 | buffer = rcu_dereference(event->buffer); |
| 3104 | if (!data) | 3104 | if (!buffer) |
| 3105 | goto out; | 3105 | goto out; |
| 3106 | 3106 | ||
| 3107 | handle->data = data; | 3107 | handle->buffer = buffer; |
| 3108 | handle->event = event; | 3108 | handle->event = event; |
| 3109 | handle->nmi = nmi; | 3109 | handle->nmi = nmi; |
| 3110 | handle->sample = sample; | 3110 | handle->sample = sample; |
| 3111 | 3111 | ||
| 3112 | if (!data->nr_pages) | 3112 | if (!buffer->nr_pages) |
| 3113 | goto out; | 3113 | goto out; |
| 3114 | 3114 | ||
| 3115 | have_lost = local_read(&data->lost); | 3115 | have_lost = local_read(&buffer->lost); |
| 3116 | if (have_lost) | 3116 | if (have_lost) |
| 3117 | size += sizeof(lost_event); | 3117 | size += sizeof(lost_event); |
| 3118 | 3118 | ||
| @@ -3124,30 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3124 | * tail pointer. So that all reads will be completed before the | 3124 | * tail pointer. So that all reads will be completed before the |
| 3125 | * write is issued. | 3125 | * write is issued. |
| 3126 | */ | 3126 | */ |
| 3127 | tail = ACCESS_ONCE(data->user_page->data_tail); | 3127 | tail = ACCESS_ONCE(buffer->user_page->data_tail); |
| 3128 | smp_rmb(); | 3128 | smp_rmb(); |
| 3129 | offset = head = local_read(&data->head); | 3129 | offset = head = local_read(&buffer->head); |
| 3130 | head += size; | 3130 | head += size; |
| 3131 | if (unlikely(!perf_output_space(data, tail, offset, head))) | 3131 | if (unlikely(!perf_output_space(buffer, tail, offset, head))) |
| 3132 | goto fail; | 3132 | goto fail; |
| 3133 | } while (local_cmpxchg(&data->head, offset, head) != offset); | 3133 | } while (local_cmpxchg(&buffer->head, offset, head) != offset); |
| 3134 | 3134 | ||
| 3135 | if (head - local_read(&data->wakeup) > data->watermark) | 3135 | if (head - local_read(&buffer->wakeup) > buffer->watermark) |
| 3136 | local_add(data->watermark, &data->wakeup); | 3136 | local_add(buffer->watermark, &buffer->wakeup); |
| 3137 | 3137 | ||
| 3138 | handle->page = offset >> (PAGE_SHIFT + page_order(data)); | 3138 | handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); |
| 3139 | handle->page &= data->nr_pages - 1; | 3139 | handle->page &= buffer->nr_pages - 1; |
| 3140 | handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); | 3140 | handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); |
| 3141 | handle->addr = data->data_pages[handle->page]; | 3141 | handle->addr = buffer->data_pages[handle->page]; |
| 3142 | handle->addr += handle->size; | 3142 | handle->addr += handle->size; |
| 3143 | handle->size = (PAGE_SIZE << page_order(data)) - handle->size; | 3143 | handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; |
| 3144 | 3144 | ||
| 3145 | if (have_lost) { | 3145 | if (have_lost) { |
| 3146 | lost_event.header.type = PERF_RECORD_LOST; | 3146 | lost_event.header.type = PERF_RECORD_LOST; |
| 3147 | lost_event.header.misc = 0; | 3147 | lost_event.header.misc = 0; |
| 3148 | lost_event.header.size = sizeof(lost_event); | 3148 | lost_event.header.size = sizeof(lost_event); |
| 3149 | lost_event.id = event->id; | 3149 | lost_event.id = event->id; |
| 3150 | lost_event.lost = local_xchg(&data->lost, 0); | 3150 | lost_event.lost = local_xchg(&buffer->lost, 0); |
| 3151 | 3151 | ||
| 3152 | perf_output_put(handle, lost_event); | 3152 | perf_output_put(handle, lost_event); |
| 3153 | } | 3153 | } |
| @@ -3155,7 +3155,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 3155 | return 0; | 3155 | return 0; |
| 3156 | 3156 | ||
| 3157 | fail: | 3157 | fail: |
| 3158 | local_inc(&data->lost); | 3158 | local_inc(&buffer->lost); |
| 3159 | perf_output_put_handle(handle); | 3159 | perf_output_put_handle(handle); |
| 3160 | out: | 3160 | out: |
| 3161 | rcu_read_unlock(); | 3161 | rcu_read_unlock(); |
| @@ -3166,15 +3166,15 @@ out: | |||
| 3166 | void perf_output_end(struct perf_output_handle *handle) | 3166 | void perf_output_end(struct perf_output_handle *handle) |
| 3167 | { | 3167 | { |
| 3168 | struct perf_event *event = handle->event; | 3168 | struct perf_event *event = handle->event; |
| 3169 | struct perf_mmap_data *data = handle->data; | 3169 | struct perf_buffer *buffer = handle->buffer; |
| 3170 | 3170 | ||
| 3171 | int wakeup_events = event->attr.wakeup_events; | 3171 | int wakeup_events = event->attr.wakeup_events; |
| 3172 | 3172 | ||
| 3173 | if (handle->sample && wakeup_events) { | 3173 | if (handle->sample && wakeup_events) { |
| 3174 | int events = local_inc_return(&data->events); | 3174 | int events = local_inc_return(&buffer->events); |
| 3175 | if (events >= wakeup_events) { | 3175 | if (events >= wakeup_events) { |
| 3176 | local_sub(wakeup_events, &data->events); | 3176 | local_sub(wakeup_events, &buffer->events); |
| 3177 | local_inc(&data->wakeup); | 3177 | local_inc(&buffer->wakeup); |
| 3178 | } | 3178 | } |
| 3179 | } | 3179 | } |
| 3180 | 3180 | ||
| @@ -3211,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
| 3211 | u64 values[4]; | 3211 | u64 values[4]; |
| 3212 | int n = 0; | 3212 | int n = 0; |
| 3213 | 3213 | ||
| 3214 | values[n++] = atomic64_read(&event->count); | 3214 | values[n++] = perf_event_count(event); |
| 3215 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | 3215 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
| 3216 | values[n++] = event->total_time_enabled + | 3216 | values[n++] = event->total_time_enabled + |
| 3217 | atomic64_read(&event->child_total_time_enabled); | 3217 | atomic64_read(&event->child_total_time_enabled); |
| @@ -3248,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 3248 | if (leader != event) | 3248 | if (leader != event) |
| 3249 | leader->pmu->read(leader); | 3249 | leader->pmu->read(leader); |
| 3250 | 3250 | ||
| 3251 | values[n++] = atomic64_read(&leader->count); | 3251 | values[n++] = perf_event_count(leader); |
| 3252 | if (read_format & PERF_FORMAT_ID) | 3252 | if (read_format & PERF_FORMAT_ID) |
| 3253 | values[n++] = primary_event_id(leader); | 3253 | values[n++] = primary_event_id(leader); |
| 3254 | 3254 | ||
| @@ -3260,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 3260 | if (sub != event) | 3260 | if (sub != event) |
| 3261 | sub->pmu->read(sub); | 3261 | sub->pmu->read(sub); |
| 3262 | 3262 | ||
| 3263 | values[n++] = atomic64_read(&sub->count); | 3263 | values[n++] = perf_event_count(sub); |
| 3264 | if (read_format & PERF_FORMAT_ID) | 3264 | if (read_format & PERF_FORMAT_ID) |
| 3265 | values[n++] = primary_event_id(sub); | 3265 | values[n++] = primary_event_id(sub); |
| 3266 | 3266 | ||
| @@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event, | |||
| 3491 | /* | 3491 | /* |
| 3492 | * task tracking -- fork/exit | 3492 | * task tracking -- fork/exit |
| 3493 | * | 3493 | * |
| 3494 | * enabled by: attr.comm | attr.mmap | attr.task | 3494 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task |
| 3495 | */ | 3495 | */ |
| 3496 | 3496 | ||
| 3497 | struct perf_task_event { | 3497 | struct perf_task_event { |
| @@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event) | |||
| 3541 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3541 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| 3542 | return 0; | 3542 | return 0; |
| 3543 | 3543 | ||
| 3544 | if (event->attr.comm || event->attr.mmap || event->attr.task) | 3544 | if (event->attr.comm || event->attr.mmap || |
| 3545 | event->attr.mmap_data || event->attr.task) | ||
| 3545 | return 1; | 3546 | return 1; |
| 3546 | 3547 | ||
| 3547 | return 0; | 3548 | return 0; |
| @@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 3766 | } | 3767 | } |
| 3767 | 3768 | ||
| 3768 | static int perf_event_mmap_match(struct perf_event *event, | 3769 | static int perf_event_mmap_match(struct perf_event *event, |
| 3769 | struct perf_mmap_event *mmap_event) | 3770 | struct perf_mmap_event *mmap_event, |
| 3771 | int executable) | ||
| 3770 | { | 3772 | { |
| 3771 | if (event->state < PERF_EVENT_STATE_INACTIVE) | 3773 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
| 3772 | return 0; | 3774 | return 0; |
| @@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event, | |||
| 3774 | if (event->cpu != -1 && event->cpu != smp_processor_id()) | 3776 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
| 3775 | return 0; | 3777 | return 0; |
| 3776 | 3778 | ||
| 3777 | if (event->attr.mmap) | 3779 | if ((!executable && event->attr.mmap_data) || |
| 3780 | (executable && event->attr.mmap)) | ||
| 3778 | return 1; | 3781 | return 1; |
| 3779 | 3782 | ||
| 3780 | return 0; | 3783 | return 0; |
| 3781 | } | 3784 | } |
| 3782 | 3785 | ||
| 3783 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, | 3786 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, |
| 3784 | struct perf_mmap_event *mmap_event) | 3787 | struct perf_mmap_event *mmap_event, |
| 3788 | int executable) | ||
| 3785 | { | 3789 | { |
| 3786 | struct perf_event *event; | 3790 | struct perf_event *event; |
| 3787 | 3791 | ||
| 3788 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 3792 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
| 3789 | if (perf_event_mmap_match(event, mmap_event)) | 3793 | if (perf_event_mmap_match(event, mmap_event, executable)) |
| 3790 | perf_event_mmap_output(event, mmap_event); | 3794 | perf_event_mmap_output(event, mmap_event); |
| 3791 | } | 3795 | } |
| 3792 | } | 3796 | } |
| @@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 3830 | if (!vma->vm_mm) { | 3834 | if (!vma->vm_mm) { |
| 3831 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | 3835 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); |
| 3832 | goto got_name; | 3836 | goto got_name; |
| 3837 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
| 3838 | vma->vm_end >= vma->vm_mm->brk) { | ||
| 3839 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | ||
| 3840 | goto got_name; | ||
| 3841 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | ||
| 3842 | vma->vm_end >= vma->vm_mm->start_stack) { | ||
| 3843 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | ||
| 3844 | goto got_name; | ||
| 3833 | } | 3845 | } |
| 3834 | 3846 | ||
| 3835 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 3847 | name = strncpy(tmp, "//anon", sizeof(tmp)); |
| @@ -3846,17 +3858,17 @@ got_name: | |||
| 3846 | 3858 | ||
| 3847 | rcu_read_lock(); | 3859 | rcu_read_lock(); |
| 3848 | cpuctx = &get_cpu_var(perf_cpu_context); | 3860 | cpuctx = &get_cpu_var(perf_cpu_context); |
| 3849 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); | 3861 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); |
| 3850 | ctx = rcu_dereference(current->perf_event_ctxp); | 3862 | ctx = rcu_dereference(current->perf_event_ctxp); |
| 3851 | if (ctx) | 3863 | if (ctx) |
| 3852 | perf_event_mmap_ctx(ctx, mmap_event); | 3864 | perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); |
| 3853 | put_cpu_var(perf_cpu_context); | 3865 | put_cpu_var(perf_cpu_context); |
| 3854 | rcu_read_unlock(); | 3866 | rcu_read_unlock(); |
| 3855 | 3867 | ||
| 3856 | kfree(buf); | 3868 | kfree(buf); |
| 3857 | } | 3869 | } |
| 3858 | 3870 | ||
| 3859 | void __perf_event_mmap(struct vm_area_struct *vma) | 3871 | void perf_event_mmap(struct vm_area_struct *vma) |
| 3860 | { | 3872 | { |
| 3861 | struct perf_mmap_event mmap_event; | 3873 | struct perf_mmap_event mmap_event; |
| 3862 | 3874 | ||
| @@ -4018,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) | |||
| 4018 | hwc->last_period = hwc->sample_period; | 4030 | hwc->last_period = hwc->sample_period; |
| 4019 | 4031 | ||
| 4020 | again: | 4032 | again: |
| 4021 | old = val = atomic64_read(&hwc->period_left); | 4033 | old = val = local64_read(&hwc->period_left); |
| 4022 | if (val < 0) | 4034 | if (val < 0) |
| 4023 | return 0; | 4035 | return 0; |
| 4024 | 4036 | ||
| 4025 | nr = div64_u64(period + val, period); | 4037 | nr = div64_u64(period + val, period); |
| 4026 | offset = nr * period; | 4038 | offset = nr * period; |
| 4027 | val -= offset; | 4039 | val -= offset; |
| 4028 | if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) | 4040 | if (local64_cmpxchg(&hwc->period_left, old, val) != old) |
| 4029 | goto again; | 4041 | goto again; |
| 4030 | 4042 | ||
| 4031 | return nr; | 4043 | return nr; |
| @@ -4064,7 +4076,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
| 4064 | { | 4076 | { |
| 4065 | struct hw_perf_event *hwc = &event->hw; | 4077 | struct hw_perf_event *hwc = &event->hw; |
| 4066 | 4078 | ||
| 4067 | atomic64_add(nr, &event->count); | 4079 | local64_add(nr, &event->count); |
| 4068 | 4080 | ||
| 4069 | if (!regs) | 4081 | if (!regs) |
| 4070 | return; | 4082 | return; |
| @@ -4075,7 +4087,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, | |||
| 4075 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4087 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
| 4076 | return perf_swevent_overflow(event, 1, nmi, data, regs); | 4088 | return perf_swevent_overflow(event, 1, nmi, data, regs); |
| 4077 | 4089 | ||
| 4078 | if (atomic64_add_negative(nr, &hwc->period_left)) | 4090 | if (local64_add_negative(nr, &hwc->period_left)) |
| 4079 | return; | 4091 | return; |
| 4080 | 4092 | ||
| 4081 | perf_swevent_overflow(event, 0, nmi, data, regs); | 4093 | perf_swevent_overflow(event, 0, nmi, data, regs); |
| @@ -4213,14 +4225,12 @@ int perf_swevent_get_recursion_context(void) | |||
| 4213 | } | 4225 | } |
| 4214 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); | 4226 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
| 4215 | 4227 | ||
| 4216 | void perf_swevent_put_recursion_context(int rctx) | 4228 | void inline perf_swevent_put_recursion_context(int rctx) |
| 4217 | { | 4229 | { |
| 4218 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | 4230 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
| 4219 | barrier(); | 4231 | barrier(); |
| 4220 | cpuctx->recursion[rctx]--; | 4232 | cpuctx->recursion[rctx]--; |
| 4221 | } | 4233 | } |
| 4222 | EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); | ||
| 4223 | |||
| 4224 | 4234 | ||
| 4225 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4235 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
| 4226 | struct pt_regs *regs, u64 addr) | 4236 | struct pt_regs *regs, u64 addr) |
| @@ -4368,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) | |||
| 4368 | u64 now; | 4378 | u64 now; |
| 4369 | 4379 | ||
| 4370 | now = cpu_clock(cpu); | 4380 | now = cpu_clock(cpu); |
| 4371 | prev = atomic64_xchg(&event->hw.prev_count, now); | 4381 | prev = local64_xchg(&event->hw.prev_count, now); |
| 4372 | atomic64_add(now - prev, &event->count); | 4382 | local64_add(now - prev, &event->count); |
| 4373 | } | 4383 | } |
| 4374 | 4384 | ||
| 4375 | static int cpu_clock_perf_event_enable(struct perf_event *event) | 4385 | static int cpu_clock_perf_event_enable(struct perf_event *event) |
| @@ -4377,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) | |||
| 4377 | struct hw_perf_event *hwc = &event->hw; | 4387 | struct hw_perf_event *hwc = &event->hw; |
| 4378 | int cpu = raw_smp_processor_id(); | 4388 | int cpu = raw_smp_processor_id(); |
| 4379 | 4389 | ||
| 4380 | atomic64_set(&hwc->prev_count, cpu_clock(cpu)); | 4390 | local64_set(&hwc->prev_count, cpu_clock(cpu)); |
| 4381 | perf_swevent_start_hrtimer(event); | 4391 | perf_swevent_start_hrtimer(event); |
| 4382 | 4392 | ||
| 4383 | return 0; | 4393 | return 0; |
| @@ -4409,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) | |||
| 4409 | u64 prev; | 4419 | u64 prev; |
| 4410 | s64 delta; | 4420 | s64 delta; |
| 4411 | 4421 | ||
| 4412 | prev = atomic64_xchg(&event->hw.prev_count, now); | 4422 | prev = local64_xchg(&event->hw.prev_count, now); |
| 4413 | delta = now - prev; | 4423 | delta = now - prev; |
| 4414 | atomic64_add(delta, &event->count); | 4424 | local64_add(delta, &event->count); |
| 4415 | } | 4425 | } |
| 4416 | 4426 | ||
| 4417 | static int task_clock_perf_event_enable(struct perf_event *event) | 4427 | static int task_clock_perf_event_enable(struct perf_event *event) |
| @@ -4421,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) | |||
| 4421 | 4431 | ||
| 4422 | now = event->ctx->time; | 4432 | now = event->ctx->time; |
| 4423 | 4433 | ||
| 4424 | atomic64_set(&hwc->prev_count, now); | 4434 | local64_set(&hwc->prev_count, now); |
| 4425 | 4435 | ||
| 4426 | perf_swevent_start_hrtimer(event); | 4436 | perf_swevent_start_hrtimer(event); |
| 4427 | 4437 | ||
| @@ -4601,7 +4611,7 @@ static int perf_tp_event_match(struct perf_event *event, | |||
| 4601 | } | 4611 | } |
| 4602 | 4612 | ||
| 4603 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | 4613 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, |
| 4604 | struct pt_regs *regs, struct hlist_head *head) | 4614 | struct pt_regs *regs, struct hlist_head *head, int rctx) |
| 4605 | { | 4615 | { |
| 4606 | struct perf_sample_data data; | 4616 | struct perf_sample_data data; |
| 4607 | struct perf_event *event; | 4617 | struct perf_event *event; |
| @@ -4615,12 +4625,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
| 4615 | perf_sample_data_init(&data, addr); | 4625 | perf_sample_data_init(&data, addr); |
| 4616 | data.raw = &raw; | 4626 | data.raw = &raw; |
| 4617 | 4627 | ||
| 4618 | rcu_read_lock(); | ||
| 4619 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4628 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
| 4620 | if (perf_tp_event_match(event, &data, regs)) | 4629 | if (perf_tp_event_match(event, &data, regs)) |
| 4621 | perf_swevent_add(event, count, 1, &data, regs); | 4630 | perf_swevent_add(event, count, 1, &data, regs); |
| 4622 | } | 4631 | } |
| 4623 | rcu_read_unlock(); | 4632 | |
| 4633 | perf_swevent_put_recursion_context(rctx); | ||
| 4624 | } | 4634 | } |
| 4625 | EXPORT_SYMBOL_GPL(perf_tp_event); | 4635 | EXPORT_SYMBOL_GPL(perf_tp_event); |
| 4626 | 4636 | ||
| @@ -4864,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr, | |||
| 4864 | hwc->sample_period = 1; | 4874 | hwc->sample_period = 1; |
| 4865 | hwc->last_period = hwc->sample_period; | 4875 | hwc->last_period = hwc->sample_period; |
| 4866 | 4876 | ||
| 4867 | atomic64_set(&hwc->period_left, hwc->sample_period); | 4877 | local64_set(&hwc->period_left, hwc->sample_period); |
| 4868 | 4878 | ||
| 4869 | /* | 4879 | /* |
| 4870 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 4880 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
| @@ -4913,7 +4923,7 @@ done: | |||
| 4913 | 4923 | ||
| 4914 | if (!event->parent) { | 4924 | if (!event->parent) { |
| 4915 | atomic_inc(&nr_events); | 4925 | atomic_inc(&nr_events); |
| 4916 | if (event->attr.mmap) | 4926 | if (event->attr.mmap || event->attr.mmap_data) |
| 4917 | atomic_inc(&nr_mmap_events); | 4927 | atomic_inc(&nr_mmap_events); |
| 4918 | if (event->attr.comm) | 4928 | if (event->attr.comm) |
| 4919 | atomic_inc(&nr_comm_events); | 4929 | atomic_inc(&nr_comm_events); |
| @@ -5007,7 +5017,7 @@ err_size: | |||
| 5007 | static int | 5017 | static int |
| 5008 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 5018 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
| 5009 | { | 5019 | { |
| 5010 | struct perf_mmap_data *data = NULL, *old_data = NULL; | 5020 | struct perf_buffer *buffer = NULL, *old_buffer = NULL; |
| 5011 | int ret = -EINVAL; | 5021 | int ret = -EINVAL; |
| 5012 | 5022 | ||
| 5013 | if (!output_event) | 5023 | if (!output_event) |
| @@ -5037,19 +5047,19 @@ set: | |||
| 5037 | 5047 | ||
| 5038 | if (output_event) { | 5048 | if (output_event) { |
| 5039 | /* get the buffer we want to redirect to */ | 5049 | /* get the buffer we want to redirect to */ |
| 5040 | data = perf_mmap_data_get(output_event); | 5050 | buffer = perf_buffer_get(output_event); |
| 5041 | if (!data) | 5051 | if (!buffer) |
| 5042 | goto unlock; | 5052 | goto unlock; |
| 5043 | } | 5053 | } |
| 5044 | 5054 | ||
| 5045 | old_data = event->data; | 5055 | old_buffer = event->buffer; |
| 5046 | rcu_assign_pointer(event->data, data); | 5056 | rcu_assign_pointer(event->buffer, buffer); |
| 5047 | ret = 0; | 5057 | ret = 0; |
| 5048 | unlock: | 5058 | unlock: |
| 5049 | mutex_unlock(&event->mmap_mutex); | 5059 | mutex_unlock(&event->mmap_mutex); |
| 5050 | 5060 | ||
| 5051 | if (old_data) | 5061 | if (old_buffer) |
| 5052 | perf_mmap_data_put(old_data); | 5062 | perf_buffer_put(old_buffer); |
| 5053 | out: | 5063 | out: |
| 5054 | return ret; | 5064 | return ret; |
| 5055 | } | 5065 | } |
| @@ -5298,7 +5308,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 5298 | hwc->sample_period = sample_period; | 5308 | hwc->sample_period = sample_period; |
| 5299 | hwc->last_period = sample_period; | 5309 | hwc->last_period = sample_period; |
| 5300 | 5310 | ||
| 5301 | atomic64_set(&hwc->period_left, sample_period); | 5311 | local64_set(&hwc->period_left, sample_period); |
| 5302 | } | 5312 | } |
| 5303 | 5313 | ||
| 5304 | child_event->overflow_handler = parent_event->overflow_handler; | 5314 | child_event->overflow_handler = parent_event->overflow_handler; |
| @@ -5359,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event, | |||
| 5359 | if (child_event->attr.inherit_stat) | 5369 | if (child_event->attr.inherit_stat) |
| 5360 | perf_event_read_event(child_event, child); | 5370 | perf_event_read_event(child_event, child); |
| 5361 | 5371 | ||
| 5362 | child_val = atomic64_read(&child_event->count); | 5372 | child_val = perf_event_count(child_event); |
| 5363 | 5373 | ||
| 5364 | /* | 5374 | /* |
| 5365 | * Add back the child's count to the parent's count: | 5375 | * Add back the child's count to the parent's count: |
| 5366 | */ | 5376 | */ |
| 5367 | atomic64_add(child_val, &parent_event->count); | 5377 | atomic64_add(child_val, &parent_event->child_count); |
| 5368 | atomic64_add(child_event->total_time_enabled, | 5378 | atomic64_add(child_event->total_time_enabled, |
| 5369 | &parent_event->child_total_time_enabled); | 5379 | &parent_event->child_total_time_enabled); |
| 5370 | atomic64_add(child_event->total_time_running, | 5380 | atomic64_add(child_event->total_time_running, |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399c..f66bdd33a6c6 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | |||
| 232 | 232 | ||
| 233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 233 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
| 234 | { | 234 | { |
| 235 | struct sighand_struct *sighand; | 235 | struct signal_struct *sig = tsk->signal; |
| 236 | struct signal_struct *sig; | ||
| 237 | struct task_struct *t; | 236 | struct task_struct *t; |
| 238 | 237 | ||
| 239 | *times = INIT_CPUTIME; | 238 | times->utime = sig->utime; |
| 239 | times->stime = sig->stime; | ||
| 240 | times->sum_exec_runtime = sig->sum_sched_runtime; | ||
| 240 | 241 | ||
| 241 | rcu_read_lock(); | 242 | rcu_read_lock(); |
| 242 | sighand = rcu_dereference(tsk->sighand); | 243 | /* make sure we can trust tsk->thread_group list */ |
| 243 | if (!sighand) | 244 | if (!likely(pid_alive(tsk))) |
| 244 | goto out; | 245 | goto out; |
| 245 | 246 | ||
| 246 | sig = tsk->signal; | ||
| 247 | |||
| 248 | t = tsk; | 247 | t = tsk; |
| 249 | do { | 248 | do { |
| 250 | times->utime = cputime_add(times->utime, t->utime); | 249 | times->utime = cputime_add(times->utime, t->utime); |
| 251 | times->stime = cputime_add(times->stime, t->stime); | 250 | times->stime = cputime_add(times->stime, t->stime); |
| 252 | times->sum_exec_runtime += t->se.sum_exec_runtime; | 251 | times->sum_exec_runtime += t->se.sum_exec_runtime; |
| 253 | 252 | } while_each_thread(tsk, t); | |
| 254 | t = next_thread(t); | ||
| 255 | } while (t != tsk); | ||
| 256 | |||
| 257 | times->utime = cputime_add(times->utime, sig->utime); | ||
| 258 | times->stime = cputime_add(times->stime, sig->stime); | ||
| 259 | times->sum_exec_runtime += sig->sum_sched_runtime; | ||
| 260 | out: | 253 | out: |
| 261 | rcu_read_unlock(); | 254 | rcu_read_unlock(); |
| 262 | } | 255 | } |
| @@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
| 1279 | { | 1272 | { |
| 1280 | struct signal_struct *sig; | 1273 | struct signal_struct *sig; |
| 1281 | 1274 | ||
| 1282 | /* tsk == current, ensure it is safe to use ->signal/sighand */ | ||
| 1283 | if (unlikely(tsk->exit_state)) | ||
| 1284 | return 0; | ||
| 1285 | |||
| 1286 | if (!task_cputime_zero(&tsk->cputime_expires)) { | 1275 | if (!task_cputime_zero(&tsk->cputime_expires)) { |
| 1287 | struct task_cputime task_sample = { | 1276 | struct task_cputime task_sample = { |
| 1288 | .utime = tsk->utime, | 1277 | .utime = tsk->utime, |
| @@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
| 1298 | if (sig->cputimer.running) { | 1287 | if (sig->cputimer.running) { |
| 1299 | struct task_cputime group_sample; | 1288 | struct task_cputime group_sample; |
| 1300 | 1289 | ||
| 1301 | thread_group_cputimer(tsk, &group_sample); | 1290 | spin_lock(&sig->cputimer.lock); |
| 1291 | group_sample = sig->cputimer.cputime; | ||
| 1292 | spin_unlock(&sig->cputimer.lock); | ||
| 1293 | |||
| 1302 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1294 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
| 1303 | return 1; | 1295 | return 1; |
| 1304 | } | 1296 | } |
| @@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
| 1315 | { | 1307 | { |
| 1316 | LIST_HEAD(firing); | 1308 | LIST_HEAD(firing); |
| 1317 | struct k_itimer *timer, *next; | 1309 | struct k_itimer *timer, *next; |
| 1310 | unsigned long flags; | ||
| 1318 | 1311 | ||
| 1319 | BUG_ON(!irqs_disabled()); | 1312 | BUG_ON(!irqs_disabled()); |
| 1320 | 1313 | ||
| @@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
| 1325 | if (!fastpath_timer_check(tsk)) | 1318 | if (!fastpath_timer_check(tsk)) |
| 1326 | return; | 1319 | return; |
| 1327 | 1320 | ||
| 1328 | spin_lock(&tsk->sighand->siglock); | 1321 | if (!lock_task_sighand(tsk, &flags)) |
| 1322 | return; | ||
| 1329 | /* | 1323 | /* |
| 1330 | * Here we take off tsk->signal->cpu_timers[N] and | 1324 | * Here we take off tsk->signal->cpu_timers[N] and |
| 1331 | * tsk->cpu_timers[N] all the timers that are firing, and | 1325 | * tsk->cpu_timers[N] all the timers that are firing, and |
| @@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
| 1347 | * that gets the timer lock before we do will give it up and | 1341 | * that gets the timer lock before we do will give it up and |
| 1348 | * spin until we've taken care of that timer below. | 1342 | * spin until we've taken care of that timer below. |
| 1349 | */ | 1343 | */ |
| 1350 | spin_unlock(&tsk->sighand->siglock); | 1344 | unlock_task_sighand(tsk, &flags); |
| 1351 | 1345 | ||
| 1352 | /* | 1346 | /* |
| 1353 | * Now that all the timers on our list have the firing flag, | 1347 | * Now that all the timers on our list have the firing flag, |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc3..9ca4973f736d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 560 | new_timer->it_clock = which_clock; | 560 | new_timer->it_clock = which_clock; |
| 561 | new_timer->it_overrun = -1; | 561 | new_timer->it_overrun = -1; |
| 562 | 562 | ||
| 563 | if (copy_to_user(created_timer_id, | ||
| 564 | &new_timer_id, sizeof (new_timer_id))) { | ||
| 565 | error = -EFAULT; | ||
| 566 | goto out; | ||
| 567 | } | ||
| 568 | if (timer_event_spec) { | 563 | if (timer_event_spec) { |
| 569 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { | 564 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { |
| 570 | error = -EFAULT; | 565 | error = -EFAULT; |
| @@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 590 | new_timer->sigq->info.si_tid = new_timer->it_id; | 585 | new_timer->sigq->info.si_tid = new_timer->it_id; |
| 591 | new_timer->sigq->info.si_code = SI_TIMER; | 586 | new_timer->sigq->info.si_code = SI_TIMER; |
| 592 | 587 | ||
| 588 | if (copy_to_user(created_timer_id, | ||
| 589 | &new_timer_id, sizeof (new_timer_id))) { | ||
| 590 | error = -EFAULT; | ||
| 591 | goto out; | ||
| 592 | } | ||
| 593 | |||
| 593 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 594 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); |
| 594 | if (error) | 595 | if (error) |
| 595 | goto out; | 596 | goto out; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab6..028a99598f49 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
| 16 | #include <linux/freezer.h> | 16 | #include <linux/freezer.h> |
| 17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
| 18 | #include <linux/workqueue.h> | ||
| 18 | 19 | ||
| 19 | /* | 20 | /* |
| 20 | * Timeout for stopping processes | 21 | * Timeout for stopping processes |
| @@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 35 | struct task_struct *g, *p; | 36 | struct task_struct *g, *p; |
| 36 | unsigned long end_time; | 37 | unsigned long end_time; |
| 37 | unsigned int todo; | 38 | unsigned int todo; |
| 39 | bool wq_busy = false; | ||
| 38 | struct timeval start, end; | 40 | struct timeval start, end; |
| 39 | u64 elapsed_csecs64; | 41 | u64 elapsed_csecs64; |
| 40 | unsigned int elapsed_csecs; | 42 | unsigned int elapsed_csecs; |
| @@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 42 | do_gettimeofday(&start); | 44 | do_gettimeofday(&start); |
| 43 | 45 | ||
| 44 | end_time = jiffies + TIMEOUT; | 46 | end_time = jiffies + TIMEOUT; |
| 47 | |||
| 48 | if (!sig_only) | ||
| 49 | freeze_workqueues_begin(); | ||
| 50 | |||
| 45 | while (true) { | 51 | while (true) { |
| 46 | todo = 0; | 52 | todo = 0; |
| 47 | read_lock(&tasklist_lock); | 53 | read_lock(&tasklist_lock); |
| @@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 63 | todo++; | 69 | todo++; |
| 64 | } while_each_thread(g, p); | 70 | } while_each_thread(g, p); |
| 65 | read_unlock(&tasklist_lock); | 71 | read_unlock(&tasklist_lock); |
| 72 | |||
| 73 | if (!sig_only) { | ||
| 74 | wq_busy = freeze_workqueues_busy(); | ||
| 75 | todo += wq_busy; | ||
| 76 | } | ||
| 77 | |||
| 66 | if (!todo || time_after(jiffies, end_time)) | 78 | if (!todo || time_after(jiffies, end_time)) |
| 67 | break; | 79 | break; |
| 68 | 80 | ||
| @@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only) | |||
| 86 | */ | 98 | */ |
| 87 | printk("\n"); | 99 | printk("\n"); |
| 88 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " | 100 | printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " |
| 89 | "(%d tasks refusing to freeze):\n", | 101 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
| 90 | elapsed_csecs / 100, elapsed_csecs % 100, todo); | 102 | elapsed_csecs / 100, elapsed_csecs % 100, |
| 103 | todo - wq_busy, wq_busy); | ||
| 104 | |||
| 105 | thaw_workqueues(); | ||
| 106 | |||
| 91 | read_lock(&tasklist_lock); | 107 | read_lock(&tasklist_lock); |
| 92 | do_each_thread(g, p) { | 108 | do_each_thread(g, p) { |
| 93 | task_lock(p); | 109 | task_lock(p); |
| @@ -157,6 +173,7 @@ void thaw_processes(void) | |||
| 157 | oom_killer_enable(); | 173 | oom_killer_enable(); |
| 158 | 174 | ||
| 159 | printk("Restarting tasks ... "); | 175 | printk("Restarting tasks ... "); |
| 176 | thaw_workqueues(); | ||
| 160 | thaw_tasks(true); | 177 | thaw_tasks(true); |
| 161 | thaw_tasks(false); | 178 | thaw_tasks(false); |
| 162 | schedule(); | 179 | schedule(); |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f5..4d169835fb36 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) | |||
| 114 | } | 114 | } |
| 115 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); | 115 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); |
| 116 | #endif /* #ifdef CONFIG_PROVE_RCU */ | 116 | #endif /* #ifdef CONFIG_PROVE_RCU */ |
| 117 | |||
| 118 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 119 | static inline void debug_init_rcu_head(struct rcu_head *head) | ||
| 120 | { | ||
| 121 | debug_object_init(head, &rcuhead_debug_descr); | ||
| 122 | } | ||
| 123 | |||
| 124 | static inline void debug_rcu_head_free(struct rcu_head *head) | ||
| 125 | { | ||
| 126 | debug_object_free(head, &rcuhead_debug_descr); | ||
| 127 | } | ||
| 128 | |||
| 129 | /* | ||
| 130 | * fixup_init is called when: | ||
| 131 | * - an active object is initialized | ||
| 132 | */ | ||
| 133 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
| 134 | { | ||
| 135 | struct rcu_head *head = addr; | ||
| 136 | |||
| 137 | switch (state) { | ||
| 138 | case ODEBUG_STATE_ACTIVE: | ||
| 139 | /* | ||
| 140 | * Ensure that queued callbacks are all executed. | ||
| 141 | * If we detect that we are nested in a RCU read-side critical | ||
| 142 | * section, we should simply fail, otherwise we would deadlock. | ||
| 143 | */ | ||
| 144 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
| 145 | irqs_disabled()) { | ||
| 146 | WARN_ON(1); | ||
| 147 | return 0; | ||
| 148 | } | ||
| 149 | rcu_barrier(); | ||
| 150 | rcu_barrier_sched(); | ||
| 151 | rcu_barrier_bh(); | ||
| 152 | debug_object_init(head, &rcuhead_debug_descr); | ||
| 153 | return 1; | ||
| 154 | default: | ||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | } | ||
| 158 | |||
| 159 | /* | ||
| 160 | * fixup_activate is called when: | ||
| 161 | * - an active object is activated | ||
| 162 | * - an unknown object is activated (might be a statically initialized object) | ||
| 163 | * Activation is performed internally by call_rcu(). | ||
| 164 | */ | ||
| 165 | static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | ||
| 166 | { | ||
| 167 | struct rcu_head *head = addr; | ||
| 168 | |||
| 169 | switch (state) { | ||
| 170 | |||
| 171 | case ODEBUG_STATE_NOTAVAILABLE: | ||
| 172 | /* | ||
| 173 | * This is not really a fixup. We just make sure that it is | ||
| 174 | * tracked in the object tracker. | ||
| 175 | */ | ||
| 176 | debug_object_init(head, &rcuhead_debug_descr); | ||
| 177 | debug_object_activate(head, &rcuhead_debug_descr); | ||
| 178 | return 0; | ||
| 179 | |||
| 180 | case ODEBUG_STATE_ACTIVE: | ||
| 181 | /* | ||
| 182 | * Ensure that queued callbacks are all executed. | ||
| 183 | * If we detect that we are nested in a RCU read-side critical | ||
| 184 | * section, we should simply fail, otherwise we would deadlock. | ||
| 185 | */ | ||
| 186 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
| 187 | irqs_disabled()) { | ||
| 188 | WARN_ON(1); | ||
| 189 | return 0; | ||
| 190 | } | ||
| 191 | rcu_barrier(); | ||
| 192 | rcu_barrier_sched(); | ||
| 193 | rcu_barrier_bh(); | ||
| 194 | debug_object_activate(head, &rcuhead_debug_descr); | ||
| 195 | return 1; | ||
| 196 | default: | ||
| 197 | return 0; | ||
| 198 | } | ||
| 199 | } | ||
| 200 | |||
| 201 | /* | ||
| 202 | * fixup_free is called when: | ||
| 203 | * - an active object is freed | ||
| 204 | */ | ||
| 205 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
| 206 | { | ||
| 207 | struct rcu_head *head = addr; | ||
| 208 | |||
| 209 | switch (state) { | ||
| 210 | case ODEBUG_STATE_ACTIVE: | ||
| 211 | /* | ||
| 212 | * Ensure that queued callbacks are all executed. | ||
| 213 | * If we detect that we are nested in a RCU read-side critical | ||
| 214 | * section, we should simply fail, otherwise we would deadlock. | ||
| 215 | */ | ||
| 216 | #ifndef CONFIG_PREEMPT | ||
| 217 | WARN_ON(1); | ||
| 218 | return 0; | ||
| 219 | #else | ||
| 220 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
| 221 | irqs_disabled()) { | ||
| 222 | WARN_ON(1); | ||
| 223 | return 0; | ||
| 224 | } | ||
| 225 | rcu_barrier(); | ||
| 226 | rcu_barrier_sched(); | ||
| 227 | rcu_barrier_bh(); | ||
| 228 | debug_object_free(head, &rcuhead_debug_descr); | ||
| 229 | return 1; | ||
| 230 | #endif | ||
| 231 | default: | ||
| 232 | return 0; | ||
| 233 | } | ||
| 234 | } | ||
| 235 | |||
| 236 | /** | ||
| 237 | * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects | ||
| 238 | * @head: pointer to rcu_head structure to be initialized | ||
| 239 | * | ||
| 240 | * This function informs debugobjects of a new rcu_head structure that | ||
| 241 | * has been allocated as an auto variable on the stack. This function | ||
| 242 | * is not required for rcu_head structures that are statically defined or | ||
| 243 | * that are dynamically allocated on the heap. This function has no | ||
| 244 | * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
| 245 | */ | ||
| 246 | void init_rcu_head_on_stack(struct rcu_head *head) | ||
| 247 | { | ||
| 248 | debug_object_init_on_stack(head, &rcuhead_debug_descr); | ||
| 249 | } | ||
| 250 | EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); | ||
| 251 | |||
| 252 | /** | ||
| 253 | * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects | ||
| | | 254 | * @head: pointer to rcu_head structure that is about to go out of scope | ||
| 255 | * | ||
| 256 | * This function informs debugobjects that an on-stack rcu_head structure | ||
| 257 | * is about to go out of scope. As with init_rcu_head_on_stack(), this | ||
| 258 | * function is not required for rcu_head structures that are statically | ||
| 259 | * defined or that are dynamically allocated on the heap. Also as with | ||
| 260 | * init_rcu_head_on_stack(), this function has no effect for | ||
| 261 | * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
| 262 | */ | ||
| 263 | void destroy_rcu_head_on_stack(struct rcu_head *head) | ||
| 264 | { | ||
| 265 | debug_object_free(head, &rcuhead_debug_descr); | ||
| 266 | } | ||
| 267 | EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | ||
| 268 | |||
| 269 | struct debug_obj_descr rcuhead_debug_descr = { | ||
| 270 | .name = "rcu_head", | ||
| 271 | .fixup_init = rcuhead_fixup_init, | ||
| 272 | .fixup_activate = rcuhead_fixup_activate, | ||
| 273 | .fixup_free = rcuhead_fixup_free, | ||
| 274 | }; | ||
| 275 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | ||
| 276 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd236..196ec02f8be0 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 169 | while (list) { | 169 | while (list) { |
| 170 | next = list->next; | 170 | next = list->next; |
| 171 | prefetch(next); | 171 | prefetch(next); |
| 172 | debug_rcu_head_unqueue(list); | ||
| 172 | list->func(list); | 173 | list->func(list); |
| 173 | list = next; | 174 | list = next; |
| 174 | } | 175 | } |
| @@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, | |||
| 211 | { | 212 | { |
| 212 | unsigned long flags; | 213 | unsigned long flags; |
| 213 | 214 | ||
| 215 | debug_rcu_head_queue(head); | ||
| 214 | head->func = func; | 216 | head->func = func; |
| 215 | head->next = NULL; | 217 | head->next = NULL; |
| 216 | 218 | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a5..2e2726d790b9 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -239,8 +239,7 @@ static unsigned long | |||
| 239 | rcu_random(struct rcu_random_state *rrsp) | 239 | rcu_random(struct rcu_random_state *rrsp) |
| 240 | { | 240 | { |
| 241 | if (--rrsp->rrs_count < 0) { | 241 | if (--rrsp->rrs_count < 0) { |
| 242 | rrsp->rrs_state += | 242 | rrsp->rrs_state += (unsigned long)local_clock(); |
| 243 | (unsigned long)cpu_clock(raw_smp_processor_id()); | ||
| 244 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | 243 | rrsp->rrs_count = RCU_RANDOM_REFRESH; |
| 245 | } | 244 | } |
| 246 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | 245 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706f..d5bc43976c5a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1112 | while (list) { | 1112 | while (list) { |
| 1113 | next = list->next; | 1113 | next = list->next; |
| 1114 | prefetch(next); | 1114 | prefetch(next); |
| 1115 | debug_rcu_head_unqueue(list); | ||
| 1115 | list->func(list); | 1116 | list->func(list); |
| 1116 | list = next; | 1117 | list = next; |
| 1117 | if (++count >= rdp->blimit) | 1118 | if (++count >= rdp->blimit) |
| @@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1388 | unsigned long flags; | 1389 | unsigned long flags; |
| 1389 | struct rcu_data *rdp; | 1390 | struct rcu_data *rdp; |
| 1390 | 1391 | ||
| 1392 | debug_rcu_head_queue(head); | ||
| 1391 | head->func = func; | 1393 | head->func = func; |
| 1392 | head->next = NULL; | 1394 | head->next = NULL; |
| 1393 | 1395 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index f52a8801b7a2..41541d79e3c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -77,6 +77,7 @@ | |||
| 77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
| 78 | 78 | ||
| 79 | #include "sched_cpupri.h" | 79 | #include "sched_cpupri.h" |
| 80 | #include "workqueue_sched.h" | ||
| 80 | 81 | ||
| 81 | #define CREATE_TRACE_POINTS | 82 | #define CREATE_TRACE_POINTS |
| 82 | #include <trace/events/sched.h> | 83 | #include <trace/events/sched.h> |
| @@ -456,9 +457,10 @@ struct rq { | |||
| 456 | unsigned long nr_running; | 457 | unsigned long nr_running; |
| 457 | #define CPU_LOAD_IDX_MAX 5 | 458 | #define CPU_LOAD_IDX_MAX 5 |
| 458 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 459 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 460 | unsigned long last_load_update_tick; | ||
| 459 | #ifdef CONFIG_NO_HZ | 461 | #ifdef CONFIG_NO_HZ |
| 460 | u64 nohz_stamp; | 462 | u64 nohz_stamp; |
| 461 | unsigned char in_nohz_recently; | 463 | unsigned char nohz_balance_kick; |
| 462 | #endif | 464 | #endif |
| 463 | unsigned int skip_clock_update; | 465 | unsigned int skip_clock_update; |
| 464 | 466 | ||
| @@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu) | |||
| 1193 | 1195 | ||
| 1194 | #ifdef CONFIG_NO_HZ | 1196 | #ifdef CONFIG_NO_HZ |
| 1195 | /* | 1197 | /* |
| 1198 | * In the semi idle case, use the nearest busy cpu for migrating timers | ||
| 1199 | * from an idle cpu. This is good for power-savings. | ||
| 1200 | * | ||
| 1201 | * We don't do similar optimization for completely idle system, as | ||
| 1202 | * selecting an idle cpu will add more delays to the timers than intended | ||
| 1203 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | ||
| 1204 | */ | ||
| 1205 | int get_nohz_timer_target(void) | ||
| 1206 | { | ||
| 1207 | int cpu = smp_processor_id(); | ||
| 1208 | int i; | ||
| 1209 | struct sched_domain *sd; | ||
| 1210 | |||
| 1211 | for_each_domain(cpu, sd) { | ||
| 1212 | for_each_cpu(i, sched_domain_span(sd)) | ||
| 1213 | if (!idle_cpu(i)) | ||
| 1214 | return i; | ||
| 1215 | } | ||
| 1216 | return cpu; | ||
| 1217 | } | ||
| 1218 | /* | ||
| 1196 | * When add_timer_on() enqueues a timer into the timer wheel of an | 1219 | * When add_timer_on() enqueues a timer into the timer wheel of an |
| 1197 | * idle CPU then this timer might expire before the next timer event | 1220 | * idle CPU then this timer might expire before the next timer event |
| 1198 | * which is scheduled to wake up that CPU. In case of a completely | 1221 | * which is scheduled to wake up that CPU. In case of a completely |
| @@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu) | |||
| 1232 | smp_send_reschedule(cpu); | 1255 | smp_send_reschedule(cpu); |
| 1233 | } | 1256 | } |
| 1234 | 1257 | ||
| 1235 | int nohz_ratelimit(int cpu) | ||
| 1236 | { | ||
| 1237 | struct rq *rq = cpu_rq(cpu); | ||
| 1238 | u64 diff = rq->clock - rq->nohz_stamp; | ||
| 1239 | |||
| 1240 | rq->nohz_stamp = rq->clock; | ||
| 1241 | |||
| 1242 | return diff < (NSEC_PER_SEC / HZ) >> 1; | ||
| 1243 | } | ||
| 1244 | |||
| 1245 | #endif /* CONFIG_NO_HZ */ | 1258 | #endif /* CONFIG_NO_HZ */ |
| 1246 | 1259 | ||
| 1247 | static u64 sched_avg_period(void) | 1260 | static u64 sched_avg_period(void) |
| @@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd) | |||
| 1652 | if (root_task_group_empty()) | 1665 | if (root_task_group_empty()) |
| 1653 | return; | 1666 | return; |
| 1654 | 1667 | ||
| 1655 | now = cpu_clock(raw_smp_processor_id()); | 1668 | now = local_clock(); |
| 1656 | elapsed = now - sd->last_update; | 1669 | elapsed = now - sd->last_update; |
| 1657 | 1670 | ||
| 1658 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1671 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
| @@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
| 1805 | static void calc_load_account_idle(struct rq *this_rq); | 1818 | static void calc_load_account_idle(struct rq *this_rq); |
| 1806 | static void update_sysctl(void); | 1819 | static void update_sysctl(void); |
| 1807 | static int get_update_sysctl_factor(void); | 1820 | static int get_update_sysctl_factor(void); |
| 1821 | static void update_cpu_load(struct rq *this_rq); | ||
| 1808 | 1822 | ||
| 1809 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1823 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
| 1810 | { | 1824 | { |
| @@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 2267 | } | 2281 | } |
| 2268 | #endif | 2282 | #endif |
| 2269 | 2283 | ||
| 2270 | /*** | 2284 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, |
| 2285 | bool is_sync, bool is_migrate, bool is_local, | ||
| 2286 | unsigned long en_flags) | ||
| 2287 | { | ||
| 2288 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
| 2289 | if (is_sync) | ||
| 2290 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
| 2291 | if (is_migrate) | ||
| 2292 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
| 2293 | if (is_local) | ||
| 2294 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
| 2295 | else | ||
| 2296 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
| 2297 | |||
| 2298 | activate_task(rq, p, en_flags); | ||
| 2299 | } | ||
| 2300 | |||
| 2301 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | ||
| 2302 | int wake_flags, bool success) | ||
| 2303 | { | ||
| 2304 | trace_sched_wakeup(p, success); | ||
| 2305 | check_preempt_curr(rq, p, wake_flags); | ||
| 2306 | |||
| 2307 | p->state = TASK_RUNNING; | ||
| 2308 | #ifdef CONFIG_SMP | ||
| 2309 | if (p->sched_class->task_woken) | ||
| 2310 | p->sched_class->task_woken(rq, p); | ||
| 2311 | |||
| 2312 | if (unlikely(rq->idle_stamp)) { | ||
| 2313 | u64 delta = rq->clock - rq->idle_stamp; | ||
| 2314 | u64 max = 2*sysctl_sched_migration_cost; | ||
| 2315 | |||
| 2316 | if (delta > max) | ||
| 2317 | rq->avg_idle = max; | ||
| 2318 | else | ||
| 2319 | update_avg(&rq->avg_idle, delta); | ||
| 2320 | rq->idle_stamp = 0; | ||
| 2321 | } | ||
| 2322 | #endif | ||
| 2323 | /* if a worker is waking up, notify workqueue */ | ||
| 2324 | if ((p->flags & PF_WQ_WORKER) && success) | ||
| 2325 | wq_worker_waking_up(p, cpu_of(rq)); | ||
| 2326 | } | ||
| 2327 | |||
| 2328 | /** | ||
| 2271 | * try_to_wake_up - wake up a thread | 2329 | * try_to_wake_up - wake up a thread |
| 2272 | * @p: the to-be-woken-up thread | 2330 | * @p: the thread to be awakened |
| 2273 | * @state: the mask of task states that can be woken | 2331 | * @state: the mask of task states that can be woken |
| 2274 | * @sync: do a synchronous wakeup? | 2332 | * @wake_flags: wake modifier flags (WF_*) |
| 2275 | * | 2333 | * |
| 2276 | * Put it on the run-queue if it's not already there. The "current" | 2334 | * Put it on the run-queue if it's not already there. The "current" |
| 2277 | * thread is always on the run-queue (except when the actual | 2335 | * thread is always on the run-queue (except when the actual |
| @@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 2279 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 2337 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
| 2280 | * runnable without the overhead of this. | 2338 | * runnable without the overhead of this. |
| 2281 | * | 2339 | * |
| 2282 | * returns failure only if the task is already active. | 2340 | * Returns %true if @p was woken up, %false if it was already running |
| 2341 | * or @state didn't match @p's state. | ||
| 2283 | */ | 2342 | */ |
| 2284 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2343 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
| 2285 | int wake_flags) | 2344 | int wake_flags) |
| @@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
| 2359 | 2418 | ||
| 2360 | out_activate: | 2419 | out_activate: |
| 2361 | #endif /* CONFIG_SMP */ | 2420 | #endif /* CONFIG_SMP */ |
| 2362 | schedstat_inc(p, se.statistics.nr_wakeups); | 2421 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, |
| 2363 | if (wake_flags & WF_SYNC) | 2422 | cpu == this_cpu, en_flags); |
| 2364 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
| 2365 | if (orig_cpu != cpu) | ||
| 2366 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
| 2367 | if (cpu == this_cpu) | ||
| 2368 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
| 2369 | else | ||
| 2370 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
| 2371 | activate_task(rq, p, en_flags); | ||
| 2372 | success = 1; | 2423 | success = 1; |
| 2373 | |||
| 2374 | out_running: | 2424 | out_running: |
| 2375 | trace_sched_wakeup(p, success); | 2425 | ttwu_post_activation(p, rq, wake_flags, success); |
| 2376 | check_preempt_curr(rq, p, wake_flags); | ||
| 2377 | |||
| 2378 | p->state = TASK_RUNNING; | ||
| 2379 | #ifdef CONFIG_SMP | ||
| 2380 | if (p->sched_class->task_woken) | ||
| 2381 | p->sched_class->task_woken(rq, p); | ||
| 2382 | |||
| 2383 | if (unlikely(rq->idle_stamp)) { | ||
| 2384 | u64 delta = rq->clock - rq->idle_stamp; | ||
| 2385 | u64 max = 2*sysctl_sched_migration_cost; | ||
| 2386 | |||
| 2387 | if (delta > max) | ||
| 2388 | rq->avg_idle = max; | ||
| 2389 | else | ||
| 2390 | update_avg(&rq->avg_idle, delta); | ||
| 2391 | rq->idle_stamp = 0; | ||
| 2392 | } | ||
| 2393 | #endif | ||
| 2394 | out: | 2426 | out: |
| 2395 | task_rq_unlock(rq, &flags); | 2427 | task_rq_unlock(rq, &flags); |
| 2396 | put_cpu(); | 2428 | put_cpu(); |
| @@ -2399,6 +2431,37 @@ out: | |||
| 2399 | } | 2431 | } |
| 2400 | 2432 | ||
| 2401 | /** | 2433 | /** |
| 2434 | * try_to_wake_up_local - try to wake up a local task with rq lock held | ||
| 2435 | * @p: the thread to be awakened | ||
| 2436 | * | ||
| 2437 | * Put @p on the run-queue if it's not already there. The caller must | ||
| 2438 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | ||
| 2439 | * the current task. this_rq() stays locked over invocation. | ||
| 2440 | */ | ||
| 2441 | static void try_to_wake_up_local(struct task_struct *p) | ||
| 2442 | { | ||
| 2443 | struct rq *rq = task_rq(p); | ||
| 2444 | bool success = false; | ||
| 2445 | |||
| 2446 | BUG_ON(rq != this_rq()); | ||
| 2447 | BUG_ON(p == current); | ||
| 2448 | lockdep_assert_held(&rq->lock); | ||
| 2449 | |||
| 2450 | if (!(p->state & TASK_NORMAL)) | ||
| 2451 | return; | ||
| 2452 | |||
| 2453 | if (!p->se.on_rq) { | ||
| 2454 | if (likely(!task_running(rq, p))) { | ||
| 2455 | schedstat_inc(rq, ttwu_count); | ||
| 2456 | schedstat_inc(rq, ttwu_local); | ||
| 2457 | } | ||
| 2458 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | ||
| 2459 | success = true; | ||
| 2460 | } | ||
| 2461 | ttwu_post_activation(p, rq, 0, success); | ||
| 2462 | } | ||
| 2463 | |||
| 2464 | /** | ||
| 2402 | * wake_up_process - Wake up a specific process | 2465 | * wake_up_process - Wake up a specific process |
| 2403 | * @p: The process to be woken up. | 2466 | * @p: The process to be woken up. |
| 2404 | * | 2467 | * |
| @@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 3012 | } | 3075 | } |
| 3013 | 3076 | ||
| 3014 | /* | 3077 | /* |
| 3078 | * The exact cpuload at various idx values, calculated at every tick would be | ||
| 3079 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
| 3080 | * | ||
| 3081 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
| 3082 | * on nth tick when cpu may be busy, then we have: | ||
| 3083 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
| 3084 | * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
| 3085 | * | ||
| 3086 | * decay_load_missed() below does efficient calculation of | ||
| 3087 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
| 3088 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
| 3089 | * | ||
| 3090 | * The calculation is approximated on a 128 point scale. | ||
| 3091 | * degrade_zero_ticks is the number of ticks after which load at any | ||
| 3092 | * particular idx is approximated to be zero. | ||
| 3093 | * degrade_factor is a precomputed table, a row for each load idx. | ||
| 3094 | * Each column corresponds to degradation factor for a power of two ticks, | ||
| 3095 | * based on 128 point scale. | ||
| 3096 | * Example: | ||
| 3097 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
| 3098 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
| 3099 | * | ||
| 3100 | * With this power of 2 load factors, we can degrade the load n times | ||
| 3101 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
| 3102 | * n mult/shifts needed by the exact degradation. | ||
| 3103 | */ | ||
| 3104 | #define DEGRADE_SHIFT 7 | ||
| 3105 | static const unsigned char | ||
| 3106 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
| 3107 | static const unsigned char | ||
| 3108 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
| 3109 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
| 3110 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
| 3111 | {96, 72, 40, 12, 1, 0, 0}, | ||
| 3112 | {112, 98, 75, 43, 15, 1, 0}, | ||
| 3113 | {120, 112, 98, 76, 45, 16, 2} }; | ||
| 3114 | |||
| 3115 | /* | ||
| 3116 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
| 3117 | * would be when CPU is idle and so we just decay the old load without | ||
| 3118 | * adding any new load. | ||
| 3119 | */ | ||
| 3120 | static unsigned long | ||
| 3121 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
| 3122 | { | ||
| 3123 | int j = 0; | ||
| 3124 | |||
| 3125 | if (!missed_updates) | ||
| 3126 | return load; | ||
| 3127 | |||
| 3128 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
| 3129 | return 0; | ||
| 3130 | |||
| 3131 | if (idx == 1) | ||
| 3132 | return load >> missed_updates; | ||
| 3133 | |||
| 3134 | while (missed_updates) { | ||
| 3135 | if (missed_updates % 2) | ||
| 3136 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
| 3137 | |||
| 3138 | missed_updates >>= 1; | ||
| 3139 | j++; | ||
| 3140 | } | ||
| 3141 | return load; | ||
| 3142 | } | ||
| 3143 | |||
| 3144 | /* | ||
| 3015 | * Update rq->cpu_load[] statistics. This function is usually called every | 3145 | * Update rq->cpu_load[] statistics. This function is usually called every |
| 3016 | * scheduler tick (TICK_NSEC). | 3146 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
| 3147 | * every tick. We fix it up based on jiffies. | ||
| 3017 | */ | 3148 | */ |
| 3018 | static void update_cpu_load(struct rq *this_rq) | 3149 | static void update_cpu_load(struct rq *this_rq) |
| 3019 | { | 3150 | { |
| 3020 | unsigned long this_load = this_rq->load.weight; | 3151 | unsigned long this_load = this_rq->load.weight; |
| 3152 | unsigned long curr_jiffies = jiffies; | ||
| 3153 | unsigned long pending_updates; | ||
| 3021 | int i, scale; | 3154 | int i, scale; |
| 3022 | 3155 | ||
| 3023 | this_rq->nr_load_updates++; | 3156 | this_rq->nr_load_updates++; |
| 3024 | 3157 | ||
| 3158 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
| 3159 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
| 3160 | return; | ||
| 3161 | |||
| 3162 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 3163 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 3164 | |||
| 3025 | /* Update our load: */ | 3165 | /* Update our load: */ |
| 3026 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 3166 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
| 3167 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
| 3027 | unsigned long old_load, new_load; | 3168 | unsigned long old_load, new_load; |
| 3028 | 3169 | ||
| 3029 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 3170 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
| 3030 | 3171 | ||
| 3031 | old_load = this_rq->cpu_load[i]; | 3172 | old_load = this_rq->cpu_load[i]; |
| 3173 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
| 3032 | new_load = this_load; | 3174 | new_load = this_load; |
| 3033 | /* | 3175 | /* |
| 3034 | * Round up the averaging division if load is increasing. This | 3176 | * Round up the averaging division if load is increasing. This |
| @@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq) | |||
| 3036 | * example. | 3178 | * example. |
| 3037 | */ | 3179 | */ |
| 3038 | if (new_load > old_load) | 3180 | if (new_load > old_load) |
| 3039 | new_load += scale-1; | 3181 | new_load += scale - 1; |
| 3040 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 3182 | |
| 3183 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
| 3041 | } | 3184 | } |
| 3185 | } | ||
| 3186 | |||
| 3187 | static void update_cpu_load_active(struct rq *this_rq) | ||
| 3188 | { | ||
| 3189 | update_cpu_load(this_rq); | ||
| 3042 | 3190 | ||
| 3043 | calc_load_account_active(this_rq); | 3191 | calc_load_account_active(this_rq); |
| 3044 | } | 3192 | } |
| @@ -3426,7 +3574,7 @@ void scheduler_tick(void) | |||
| 3426 | 3574 | ||
| 3427 | raw_spin_lock(&rq->lock); | 3575 | raw_spin_lock(&rq->lock); |
| 3428 | update_rq_clock(rq); | 3576 | update_rq_clock(rq); |
| 3429 | update_cpu_load(rq); | 3577 | update_cpu_load_active(rq); |
| 3430 | curr->sched_class->task_tick(rq, curr, 0); | 3578 | curr->sched_class->task_tick(rq, curr, 0); |
| 3431 | raw_spin_unlock(&rq->lock); | 3579 | raw_spin_unlock(&rq->lock); |
| 3432 | 3580 | ||
| @@ -3598,7 +3746,6 @@ need_resched: | |||
| 3598 | rq = cpu_rq(cpu); | 3746 | rq = cpu_rq(cpu); |
| 3599 | rcu_note_context_switch(cpu); | 3747 | rcu_note_context_switch(cpu); |
| 3600 | prev = rq->curr; | 3748 | prev = rq->curr; |
| 3601 | switch_count = &prev->nivcsw; | ||
| 3602 | 3749 | ||
| 3603 | release_kernel_lock(prev); | 3750 | release_kernel_lock(prev); |
| 3604 | need_resched_nonpreemptible: | 3751 | need_resched_nonpreemptible: |
| @@ -3611,11 +3758,26 @@ need_resched_nonpreemptible: | |||
| 3611 | raw_spin_lock_irq(&rq->lock); | 3758 | raw_spin_lock_irq(&rq->lock); |
| 3612 | clear_tsk_need_resched(prev); | 3759 | clear_tsk_need_resched(prev); |
| 3613 | 3760 | ||
| 3761 | switch_count = &prev->nivcsw; | ||
| 3614 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3762 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 3615 | if (unlikely(signal_pending_state(prev->state, prev))) | 3763 | if (unlikely(signal_pending_state(prev->state, prev))) { |
| 3616 | prev->state = TASK_RUNNING; | 3764 | prev->state = TASK_RUNNING; |
| 3617 | else | 3765 | } else { |
| 3766 | /* | ||
| 3767 | * If a worker is going to sleep, notify and | ||
| 3768 | * ask workqueue whether it wants to wake up a | ||
| 3769 | * task to maintain concurrency. If so, wake | ||
| 3770 | * up the task. | ||
| 3771 | */ | ||
| 3772 | if (prev->flags & PF_WQ_WORKER) { | ||
| 3773 | struct task_struct *to_wakeup; | ||
| 3774 | |||
| 3775 | to_wakeup = wq_worker_sleeping(prev, cpu); | ||
| 3776 | if (to_wakeup) | ||
| 3777 | try_to_wake_up_local(to_wakeup); | ||
| 3778 | } | ||
| 3618 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3779 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
| 3780 | } | ||
| 3619 | switch_count = &prev->nvcsw; | 3781 | switch_count = &prev->nvcsw; |
| 3620 | } | 3782 | } |
| 3621 | 3783 | ||
| @@ -3637,8 +3799,10 @@ need_resched_nonpreemptible: | |||
| 3637 | 3799 | ||
| 3638 | context_switch(rq, prev, next); /* unlocks the rq */ | 3800 | context_switch(rq, prev, next); /* unlocks the rq */ |
| 3639 | /* | 3801 | /* |
| 3640 | * the context switch might have flipped the stack from under | 3802 | * The context switch has flipped the stack from under us |
| 3641 | * us, hence refresh the local variables. | 3803 | * and restored the local variables which were saved when |
| 3804 | * this task called schedule() in the past. prev == current | ||
| 3805 | * is still correct, but it can be moved to another cpu/rq. | ||
| 3642 | */ | 3806 | */ |
| 3643 | cpu = smp_processor_id(); | 3807 | cpu = smp_processor_id(); |
| 3644 | rq = cpu_rq(cpu); | 3808 | rq = cpu_rq(cpu); |
| @@ -3647,11 +3811,8 @@ need_resched_nonpreemptible: | |||
| 3647 | 3811 | ||
| 3648 | post_schedule(rq); | 3812 | post_schedule(rq); |
| 3649 | 3813 | ||
| 3650 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3814 | if (unlikely(reacquire_kernel_lock(prev))) |
| 3651 | prev = rq->curr; | ||
| 3652 | switch_count = &prev->nivcsw; | ||
| 3653 | goto need_resched_nonpreemptible; | 3815 | goto need_resched_nonpreemptible; |
| 3654 | } | ||
| 3655 | 3816 | ||
| 3656 | preempt_enable_no_resched(); | 3817 | preempt_enable_no_resched(); |
| 3657 | if (need_resched()) | 3818 | if (need_resched()) |
| @@ -3726,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
| 3726 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3887 | * off of preempt_enable. Kernel preemptions off return from interrupt |
| 3727 | * occur there and call schedule directly. | 3888 | * occur there and call schedule directly. |
| 3728 | */ | 3889 | */ |
| 3729 | asmlinkage void __sched preempt_schedule(void) | 3890 | asmlinkage void __sched notrace preempt_schedule(void) |
| 3730 | { | 3891 | { |
| 3731 | struct thread_info *ti = current_thread_info(); | 3892 | struct thread_info *ti = current_thread_info(); |
| 3732 | 3893 | ||
| @@ -3738,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void) | |||
| 3738 | return; | 3899 | return; |
| 3739 | 3900 | ||
| 3740 | do { | 3901 | do { |
| 3741 | add_preempt_count(PREEMPT_ACTIVE); | 3902 | add_preempt_count_notrace(PREEMPT_ACTIVE); |
| 3742 | schedule(); | 3903 | schedule(); |
| 3743 | sub_preempt_count(PREEMPT_ACTIVE); | 3904 | sub_preempt_count_notrace(PREEMPT_ACTIVE); |
| 3744 | 3905 | ||
| 3745 | /* | 3906 | /* |
| 3746 | * Check again in case we missed a preemption opportunity | 3907 | * Check again in case we missed a preemption opportunity |
| @@ -4441,12 +4602,8 @@ recheck: | |||
| 4441 | */ | 4602 | */ |
| 4442 | if (user && !capable(CAP_SYS_NICE)) { | 4603 | if (user && !capable(CAP_SYS_NICE)) { |
| 4443 | if (rt_policy(policy)) { | 4604 | if (rt_policy(policy)) { |
| 4444 | unsigned long rlim_rtprio; | 4605 | unsigned long rlim_rtprio = |
| 4445 | 4606 | task_rlimit(p, RLIMIT_RTPRIO); | |
| 4446 | if (!lock_task_sighand(p, &flags)) | ||
| 4447 | return -ESRCH; | ||
| 4448 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); | ||
| 4449 | unlock_task_sighand(p, &flags); | ||
| 4450 | 4607 | ||
| 4451 | /* can't set/change the rt policy */ | 4608 | /* can't set/change the rt policy */ |
| 4452 | if (policy != p->policy && !rlim_rtprio) | 4609 | if (policy != p->policy && !rlim_rtprio) |
| @@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5816 | */ | 5973 | */ |
| 5817 | static struct notifier_block __cpuinitdata migration_notifier = { | 5974 | static struct notifier_block __cpuinitdata migration_notifier = { |
| 5818 | .notifier_call = migration_call, | 5975 | .notifier_call = migration_call, |
| 5819 | .priority = 10 | 5976 | .priority = CPU_PRI_MIGRATION, |
| 5820 | }; | 5977 | }; |
| 5821 | 5978 | ||
| 5979 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | ||
| 5980 | unsigned long action, void *hcpu) | ||
| 5981 | { | ||
| 5982 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 5983 | case CPU_ONLINE: | ||
| 5984 | case CPU_DOWN_FAILED: | ||
| 5985 | set_cpu_active((long)hcpu, true); | ||
| 5986 | return NOTIFY_OK; | ||
| 5987 | default: | ||
| 5988 | return NOTIFY_DONE; | ||
| 5989 | } | ||
| 5990 | } | ||
| 5991 | |||
| 5992 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | ||
| 5993 | unsigned long action, void *hcpu) | ||
| 5994 | { | ||
| 5995 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 5996 | case CPU_DOWN_PREPARE: | ||
| 5997 | set_cpu_active((long)hcpu, false); | ||
| 5998 | return NOTIFY_OK; | ||
| 5999 | default: | ||
| 6000 | return NOTIFY_DONE; | ||
| 6001 | } | ||
| 6002 | } | ||
| 6003 | |||
| 5822 | static int __init migration_init(void) | 6004 | static int __init migration_init(void) |
| 5823 | { | 6005 | { |
| 5824 | void *cpu = (void *)(long)smp_processor_id(); | 6006 | void *cpu = (void *)(long)smp_processor_id(); |
| 5825 | int err; | 6007 | int err; |
| 5826 | 6008 | ||
| 5827 | /* Start one for the boot CPU: */ | 6009 | /* Initialize migration for the boot CPU */ |
| 5828 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 6010 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
| 5829 | BUG_ON(err == NOTIFY_BAD); | 6011 | BUG_ON(err == NOTIFY_BAD); |
| 5830 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 6012 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 5831 | register_cpu_notifier(&migration_notifier); | 6013 | register_cpu_notifier(&migration_notifier); |
| 5832 | 6014 | ||
| 6015 | /* Register cpu active notifiers */ | ||
| 6016 | cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); | ||
| 6017 | cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); | ||
| 6018 | |||
| 5833 | return 0; | 6019 | return 0; |
| 5834 | } | 6020 | } |
| 5835 | early_initcall(migration_init); | 6021 | early_initcall(migration_init); |
| @@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 6064 | free_rootdomain(old_rd); | 6250 | free_rootdomain(old_rd); |
| 6065 | } | 6251 | } |
| 6066 | 6252 | ||
| 6067 | static int init_rootdomain(struct root_domain *rd, bool bootmem) | 6253 | static int init_rootdomain(struct root_domain *rd) |
| 6068 | { | 6254 | { |
| 6069 | gfp_t gfp = GFP_KERNEL; | ||
| 6070 | |||
| 6071 | memset(rd, 0, sizeof(*rd)); | 6255 | memset(rd, 0, sizeof(*rd)); |
| 6072 | 6256 | ||
| 6073 | if (bootmem) | 6257 | if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) |
| 6074 | gfp = GFP_NOWAIT; | ||
| 6075 | |||
| 6076 | if (!alloc_cpumask_var(&rd->span, gfp)) | ||
| 6077 | goto out; | 6258 | goto out; |
| 6078 | if (!alloc_cpumask_var(&rd->online, gfp)) | 6259 | if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) |
| 6079 | goto free_span; | 6260 | goto free_span; |
| 6080 | if (!alloc_cpumask_var(&rd->rto_mask, gfp)) | 6261 | if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) |
| 6081 | goto free_online; | 6262 | goto free_online; |
| 6082 | 6263 | ||
| 6083 | if (cpupri_init(&rd->cpupri, bootmem) != 0) | 6264 | if (cpupri_init(&rd->cpupri) != 0) |
| 6084 | goto free_rto_mask; | 6265 | goto free_rto_mask; |
| 6085 | return 0; | 6266 | return 0; |
| 6086 | 6267 | ||
| @@ -6096,7 +6277,7 @@ out: | |||
| 6096 | 6277 | ||
| 6097 | static void init_defrootdomain(void) | 6278 | static void init_defrootdomain(void) |
| 6098 | { | 6279 | { |
| 6099 | init_rootdomain(&def_root_domain, true); | 6280 | init_rootdomain(&def_root_domain); |
| 6100 | 6281 | ||
| 6101 | atomic_set(&def_root_domain.refcount, 1); | 6282 | atomic_set(&def_root_domain.refcount, 1); |
| 6102 | } | 6283 | } |
| @@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void) | |||
| 6109 | if (!rd) | 6290 | if (!rd) |
| 6110 | return NULL; | 6291 | return NULL; |
| 6111 | 6292 | ||
| 6112 | if (init_rootdomain(rd, false) != 0) { | 6293 | if (init_rootdomain(rd) != 0) { |
| 6113 | kfree(rd); | 6294 | kfree(rd); |
| 6114 | return NULL; | 6295 | return NULL; |
| 6115 | } | 6296 | } |
| @@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
| 7288 | } | 7469 | } |
| 7289 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 7470 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
| 7290 | 7471 | ||
| 7291 | #ifndef CONFIG_CPUSETS | ||
| 7292 | /* | 7472 | /* |
| 7293 | * Add online and remove offline CPUs from the scheduler domains. | 7473 | * Update cpusets according to cpu_active mask. If cpusets are |
| 7294 | * When cpusets are enabled they take over this function. | 7474 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
| 7475 | * around partition_sched_domains(). | ||
| 7295 | */ | 7476 | */ |
| 7296 | static int update_sched_domains(struct notifier_block *nfb, | 7477 | static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, |
| 7297 | unsigned long action, void *hcpu) | 7478 | void *hcpu) |
| 7298 | { | 7479 | { |
| 7299 | switch (action) { | 7480 | switch (action & ~CPU_TASKS_FROZEN) { |
| 7300 | case CPU_ONLINE: | 7481 | case CPU_ONLINE: |
| 7301 | case CPU_ONLINE_FROZEN: | ||
| 7302 | case CPU_DOWN_PREPARE: | ||
| 7303 | case CPU_DOWN_PREPARE_FROZEN: | ||
| 7304 | case CPU_DOWN_FAILED: | 7482 | case CPU_DOWN_FAILED: |
| 7305 | case CPU_DOWN_FAILED_FROZEN: | 7483 | cpuset_update_active_cpus(); |
| 7306 | partition_sched_domains(1, NULL, NULL); | ||
| 7307 | return NOTIFY_OK; | 7484 | return NOTIFY_OK; |
| 7485 | default: | ||
| 7486 | return NOTIFY_DONE; | ||
| 7487 | } | ||
| 7488 | } | ||
| 7308 | 7489 | ||
| 7490 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | ||
| 7491 | void *hcpu) | ||
| 7492 | { | ||
| 7493 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 7494 | case CPU_DOWN_PREPARE: | ||
| 7495 | cpuset_update_active_cpus(); | ||
| 7496 | return NOTIFY_OK; | ||
| 7309 | default: | 7497 | default: |
| 7310 | return NOTIFY_DONE; | 7498 | return NOTIFY_DONE; |
| 7311 | } | 7499 | } |
| 7312 | } | 7500 | } |
| 7313 | #endif | ||
| 7314 | 7501 | ||
| 7315 | static int update_runtime(struct notifier_block *nfb, | 7502 | static int update_runtime(struct notifier_block *nfb, |
| 7316 | unsigned long action, void *hcpu) | 7503 | unsigned long action, void *hcpu) |
| @@ -7356,10 +7543,8 @@ void __init sched_init_smp(void) | |||
| 7356 | mutex_unlock(&sched_domains_mutex); | 7543 | mutex_unlock(&sched_domains_mutex); |
| 7357 | put_online_cpus(); | 7544 | put_online_cpus(); |
| 7358 | 7545 | ||
| 7359 | #ifndef CONFIG_CPUSETS | 7546 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
| 7360 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7547 | hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); |
| 7361 | hotcpu_notifier(update_sched_domains, 0); | ||
| 7362 | #endif | ||
| 7363 | 7548 | ||
| 7364 | /* RT runtime code needs to handle some hotplug events */ | 7549 | /* RT runtime code needs to handle some hotplug events */ |
| 7365 | hotcpu_notifier(update_runtime, 0); | 7550 | hotcpu_notifier(update_runtime, 0); |
| @@ -7604,6 +7789,9 @@ void __init sched_init(void) | |||
| 7604 | 7789 | ||
| 7605 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7790 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
| 7606 | rq->cpu_load[j] = 0; | 7791 | rq->cpu_load[j] = 0; |
| 7792 | |||
| 7793 | rq->last_load_update_tick = jiffies; | ||
| 7794 | |||
| 7607 | #ifdef CONFIG_SMP | 7795 | #ifdef CONFIG_SMP |
| 7608 | rq->sd = NULL; | 7796 | rq->sd = NULL; |
| 7609 | rq->rd = NULL; | 7797 | rq->rd = NULL; |
| @@ -7617,6 +7805,10 @@ void __init sched_init(void) | |||
| 7617 | rq->idle_stamp = 0; | 7805 | rq->idle_stamp = 0; |
| 7618 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 7806 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
| 7619 | rq_attach_root(rq, &def_root_domain); | 7807 | rq_attach_root(rq, &def_root_domain); |
| 7808 | #ifdef CONFIG_NO_HZ | ||
| 7809 | rq->nohz_balance_kick = 0; | ||
| 7810 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
| 7811 | #endif | ||
| 7620 | #endif | 7812 | #endif |
| 7621 | init_rq_hrtick(rq); | 7813 | init_rq_hrtick(rq); |
| 7622 | atomic_set(&rq->nr_iowait, 0); | 7814 | atomic_set(&rq->nr_iowait, 0); |
| @@ -7661,8 +7853,11 @@ void __init sched_init(void) | |||
| 7661 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 7853 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
| 7662 | #ifdef CONFIG_SMP | 7854 | #ifdef CONFIG_SMP |
| 7663 | #ifdef CONFIG_NO_HZ | 7855 | #ifdef CONFIG_NO_HZ |
| 7664 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 7856 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
| 7665 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 7857 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
| 7858 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
| 7859 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
| 7860 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
| 7666 | #endif | 7861 | #endif |
| 7667 | /* May be allocated at isolcpus cmdline parse time */ | 7862 | /* May be allocated at isolcpus cmdline parse time */ |
| 7668 | if (cpu_isolated_map == NULL) | 7863 | if (cpu_isolated_map == NULL) |
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb3..52f1a149bfb1 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c | |||
| @@ -10,19 +10,55 @@ | |||
| 10 | * Ingo Molnar <mingo@redhat.com> | 10 | * Ingo Molnar <mingo@redhat.com> |
| 11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
| 12 | * | 12 | * |
| 13 | * Create a semi stable clock from a mixture of other events, including: | 13 | * |
| 14 | * - gtod | 14 | * What: |
| 15 | * | ||
| 16 | * cpu_clock(i) provides a fast (execution time) high resolution | ||
| 17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | ||
| 18 | * is monotonic for constant i. The timestamp returned is in nanoseconds. | ||
| 19 | * | ||
| 20 | * ######################### BIG FAT WARNING ########################## | ||
| 21 | * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # | ||
| 22 | * # go backwards !! # | ||
| 23 | * #################################################################### | ||
| 24 | * | ||
| 25 | * There is no strict promise about the base, although it tends to start | ||
| 26 | * at 0 on boot (but people really shouldn't rely on that). | ||
| 27 | * | ||
| 28 | * cpu_clock(i) -- can be used from any context, including NMI. | ||
| 29 | * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) | ||
| 30 | * local_clock() -- is cpu_clock() on the current cpu. | ||
| 31 | * | ||
| 32 | * How: | ||
| 33 | * | ||
| 34 | * The implementation either uses sched_clock() when | ||
| 35 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | ||
| 36 | * sched_clock() is assumed to provide these properties (mostly it means | ||
| 37 | * the architecture provides a globally synchronized highres time source). | ||
| 38 | * | ||
| 39 | * Otherwise it tries to create a semi stable clock from a mixture of other | ||
| 40 | * clocks, including: | ||
| 41 | * | ||
| 42 | * - GTOD (clock monotonic) | ||
| 15 | * - sched_clock() | 43 | * - sched_clock() |
| 16 | * - explicit idle events | 44 | * - explicit idle events |
| 17 | * | 45 | * |
| 18 | * We use gtod as base and the unstable clock deltas. The deltas are filtered, | 46 | * We use GTOD as base and use sched_clock() deltas to improve resolution. The |
| 19 | * making it monotonic and keeping it within an expected window. | 47 | * deltas are filtered to provide monotonicity and keeping it within an |
| 48 | * expected window. | ||
| 20 | * | 49 | * |
| 21 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time | 50 | * Furthermore, explicit sleep and wakeup hooks allow us to account for time |
| 22 | * that is otherwise invisible (TSC gets stopped). | 51 | * that is otherwise invisible (TSC gets stopped). |
| 23 | * | 52 | * |
| 24 | * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat | 53 | * |
| 25 | * consistent between cpus (never more than 2 jiffies difference). | 54 | * Notes: |
| 55 | * | ||
| 56 | * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things | ||
| 57 | * like cpufreq interrupts that can change the base clock (TSC) multiplier | ||
| 58 | * and cause funny jumps in time -- although the filtering provided by | ||
| 59 | * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it | ||
| 60 | * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on | ||
| 61 | * sched_clock(). | ||
| 26 | */ | 62 | */ |
| 27 | #include <linux/spinlock.h> | 63 | #include <linux/spinlock.h> |
| 28 | #include <linux/hardirq.h> | 64 | #include <linux/hardirq.h> |
| @@ -170,6 +206,11 @@ again: | |||
| 170 | return val; | 206 | return val; |
| 171 | } | 207 | } |
| 172 | 208 | ||
| 209 | /* | ||
| 210 | * Similar to cpu_clock(), but requires local IRQs to be disabled. | ||
| 211 | * | ||
| 212 | * See cpu_clock(). | ||
| 213 | */ | ||
| 173 | u64 sched_clock_cpu(int cpu) | 214 | u64 sched_clock_cpu(int cpu) |
| 174 | { | 215 | { |
| 175 | struct sched_clock_data *scd; | 216 | struct sched_clock_data *scd; |
| @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
| 237 | } | 278 | } |
| 238 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 279 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
| 239 | 280 | ||
| 240 | unsigned long long cpu_clock(int cpu) | 281 | /* |
| 282 | * As outlined at the top, provides a fast, high resolution, nanosecond | ||
| 283 | * time source that is monotonic per cpu argument and has bounded drift | ||
| 284 | * between cpus. | ||
| 285 | * | ||
| 286 | * ######################### BIG FAT WARNING ########################## | ||
| 287 | * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # | ||
| 288 | * # go backwards !! # | ||
| 289 | * #################################################################### | ||
| 290 | */ | ||
| 291 | u64 cpu_clock(int cpu) | ||
| 241 | { | 292 | { |
| 242 | unsigned long long clock; | 293 | u64 clock; |
| 243 | unsigned long flags; | 294 | unsigned long flags; |
| 244 | 295 | ||
| 245 | local_irq_save(flags); | 296 | local_irq_save(flags); |
| @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) | |||
| 249 | return clock; | 300 | return clock; |
| 250 | } | 301 | } |
| 251 | 302 | ||
| 303 | /* | ||
| 304 | * Similar to cpu_clock() for the current cpu. Time will only be observed | ||
| 305 | * to be monotonic if care is taken to only compare timestamps taken on the | ||
| 306 | * same CPU. | ||
| 307 | * | ||
| 308 | * See cpu_clock(). | ||
| 309 | */ | ||
| 310 | u64 local_clock(void) | ||
| 311 | { | ||
| 312 | u64 clock; | ||
| 313 | unsigned long flags; | ||
| 314 | |||
| 315 | local_irq_save(flags); | ||
| 316 | clock = sched_clock_cpu(smp_processor_id()); | ||
| 317 | local_irq_restore(flags); | ||
| 318 | |||
| 319 | return clock; | ||
| 320 | } | ||
| 321 | |||
| 252 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 322 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
| 253 | 323 | ||
| 254 | void sched_clock_init(void) | 324 | void sched_clock_init(void) |
| @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) | |||
| 264 | return sched_clock(); | 334 | return sched_clock(); |
| 265 | } | 335 | } |
| 266 | 336 | ||
| 267 | 337 | u64 cpu_clock(int cpu) | |
| 268 | unsigned long long cpu_clock(int cpu) | ||
| 269 | { | 338 | { |
| 270 | return sched_clock_cpu(cpu); | 339 | return sched_clock_cpu(cpu); |
| 271 | } | 340 | } |
| 272 | 341 | ||
| 342 | u64 local_clock(void) | ||
| 343 | { | ||
| 344 | return sched_clock_cpu(0); | ||
| 345 | } | ||
| 346 | |||
| 273 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 347 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
| 274 | 348 | ||
| 275 | EXPORT_SYMBOL_GPL(cpu_clock); | 349 | EXPORT_SYMBOL_GPL(cpu_clock); |
| 350 | EXPORT_SYMBOL_GPL(local_clock); | ||
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc83..2722dc1b4138 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
| @@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 166 | * | 166 | * |
| 167 | * Returns: -ENOMEM if memory fails. | 167 | * Returns: -ENOMEM if memory fails. |
| 168 | */ | 168 | */ |
| 169 | int cpupri_init(struct cpupri *cp, bool bootmem) | 169 | int cpupri_init(struct cpupri *cp) |
| 170 | { | 170 | { |
| 171 | gfp_t gfp = GFP_KERNEL; | ||
| 172 | int i; | 171 | int i; |
| 173 | 172 | ||
| 174 | if (bootmem) | ||
| 175 | gfp = GFP_NOWAIT; | ||
| 176 | |||
| 177 | memset(cp, 0, sizeof(*cp)); | 173 | memset(cp, 0, sizeof(*cp)); |
| 178 | 174 | ||
| 179 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
| @@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) | |||
| 181 | 177 | ||
| 182 | raw_spin_lock_init(&vec->lock); | 178 | raw_spin_lock_init(&vec->lock); |
| 183 | vec->count = 0; | 179 | vec->count = 0; |
| 184 | if (!zalloc_cpumask_var(&vec->mask, gfp)) | 180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
| 185 | goto cleanup; | 181 | goto cleanup; |
| 186 | } | 182 | } |
| 187 | 183 | ||
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95be..9fc7d386fea4 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h | |||
| @@ -27,7 +27,7 @@ struct cpupri { | |||
| 27 | int cpupri_find(struct cpupri *cp, | 27 | int cpupri_find(struct cpupri *cp, |
| 28 | struct task_struct *p, struct cpumask *lowest_mask); | 28 | struct task_struct *p, struct cpumask *lowest_mask); |
| 29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 29 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
| 30 | int cpupri_init(struct cpupri *cp, bool bootmem); | 30 | int cpupri_init(struct cpupri *cp); |
| 31 | void cpupri_cleanup(struct cpupri *cp); | 31 | void cpupri_cleanup(struct cpupri *cp); |
| 32 | #else | 32 | #else |
| 33 | #define cpupri_set(cp, cpu, pri) do { } while (0) | 33 | #define cpupri_set(cp, cpu, pri) do { } while (0) |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00d..2e1b0d17dd9b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
| 332 | PN(sysctl_sched_latency); | 332 | PN(sysctl_sched_latency); |
| 333 | PN(sysctl_sched_min_granularity); | 333 | PN(sysctl_sched_min_granularity); |
| 334 | PN(sysctl_sched_wakeup_granularity); | 334 | PN(sysctl_sched_wakeup_granularity); |
| 335 | PN(sysctl_sched_child_runs_first); | 335 | P(sysctl_sched_child_runs_first); |
| 336 | P(sysctl_sched_features); | 336 | P(sysctl_sched_features); |
| 337 | #undef PN | 337 | #undef PN |
| 338 | #undef P | 338 | #undef P |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332daa..806d1b227a21 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
| 2287 | unsigned long power = SCHED_LOAD_SCALE; | 2287 | unsigned long power = SCHED_LOAD_SCALE; |
| 2288 | struct sched_group *sdg = sd->groups; | 2288 | struct sched_group *sdg = sd->groups; |
| 2289 | 2289 | ||
| 2290 | if (sched_feat(ARCH_POWER)) | ||
| 2291 | power *= arch_scale_freq_power(sd, cpu); | ||
| 2292 | else | ||
| 2293 | power *= default_scale_freq_power(sd, cpu); | ||
| 2294 | |||
| 2295 | power >>= SCHED_LOAD_SHIFT; | ||
| 2296 | |||
| 2297 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2290 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
| 2298 | if (sched_feat(ARCH_POWER)) | 2291 | if (sched_feat(ARCH_POWER)) |
| 2299 | power *= arch_scale_smt_power(sd, cpu); | 2292 | power *= arch_scale_smt_power(sd, cpu); |
| @@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
| 2303 | power >>= SCHED_LOAD_SHIFT; | 2296 | power >>= SCHED_LOAD_SHIFT; |
| 2304 | } | 2297 | } |
| 2305 | 2298 | ||
| 2299 | sdg->cpu_power_orig = power; | ||
| 2300 | |||
| 2301 | if (sched_feat(ARCH_POWER)) | ||
| 2302 | power *= arch_scale_freq_power(sd, cpu); | ||
| 2303 | else | ||
| 2304 | power *= default_scale_freq_power(sd, cpu); | ||
| 2305 | |||
| 2306 | power >>= SCHED_LOAD_SHIFT; | ||
| 2307 | |||
| 2306 | power *= scale_rt_power(cpu); | 2308 | power *= scale_rt_power(cpu); |
| 2307 | power >>= SCHED_LOAD_SHIFT; | 2309 | power >>= SCHED_LOAD_SHIFT; |
| 2308 | 2310 | ||
| @@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
| 2335 | sdg->cpu_power = power; | 2337 | sdg->cpu_power = power; |
| 2336 | } | 2338 | } |
| 2337 | 2339 | ||
| 2340 | /* | ||
| 2341 | * Try and fix up capacity for tiny siblings, this is needed when | ||
| 2342 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | ||
| 2343 | * which on its own isn't powerful enough. | ||
| 2344 | * | ||
| 2345 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
| 2346 | */ | ||
| 2347 | static inline int | ||
| 2348 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | ||
| 2349 | { | ||
| 2350 | /* | ||
| 2351 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | ||
| 2352 | */ | ||
| 2353 | if (sd->level != SD_LV_SIBLING) | ||
| 2354 | return 0; | ||
| 2355 | |||
| 2356 | /* | ||
| 2357 | * If ~90% of the cpu_power is still there, we're good. | ||
| 2358 | */ | ||
| 2359 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | ||
| 2360 | return 1; | ||
| 2361 | |||
| 2362 | return 0; | ||
| 2363 | } | ||
| 2364 | |||
| 2338 | /** | 2365 | /** |
| 2339 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 2366 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 2340 | * @sd: The sched_domain whose statistics are to be updated. | 2367 | * @sd: The sched_domain whose statistics are to be updated. |
| @@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2400 | * domains. In the newly idle case, we will allow all the cpu's | 2427 | * domains. In the newly idle case, we will allow all the cpu's |
| 2401 | * to do the newly idle load balance. | 2428 | * to do the newly idle load balance. |
| 2402 | */ | 2429 | */ |
| 2403 | if (idle != CPU_NEWLY_IDLE && local_group && | 2430 | if (idle != CPU_NEWLY_IDLE && local_group) { |
| 2404 | balance_cpu != this_cpu) { | 2431 | if (balance_cpu != this_cpu) { |
| 2405 | *balance = 0; | 2432 | *balance = 0; |
| 2406 | return; | 2433 | return; |
| 2434 | } | ||
| 2435 | update_group_power(sd, this_cpu); | ||
| 2407 | } | 2436 | } |
| 2408 | 2437 | ||
| 2409 | update_group_power(sd, this_cpu); | ||
| 2410 | |||
| 2411 | /* Adjust by relative CPU power of the group */ | 2438 | /* Adjust by relative CPU power of the group */ |
| 2412 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2439 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 2413 | 2440 | ||
| @@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2428 | 2455 | ||
| 2429 | sgs->group_capacity = | 2456 | sgs->group_capacity = |
| 2430 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2457 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
| 2458 | if (!sgs->group_capacity) | ||
| 2459 | sgs->group_capacity = fix_small_capacity(sd, group); | ||
| 2460 | } | ||
| 2461 | |||
| 2462 | /** | ||
| 2463 | * update_sd_pick_busiest - return 1 on busiest group | ||
| 2464 | * @sd: sched_domain whose statistics are to be checked | ||
| 2465 | * @sds: sched_domain statistics | ||
| 2466 | * @sg: sched_group candidate to be checked for being the busiest | ||
| 2467 | * @sgs: sched_group statistics | ||
| 2468 | * @this_cpu: the current cpu | ||
| 2469 | * | ||
| 2470 | * Determine if @sg is a busier group than the previously selected | ||
| 2471 | * busiest group. | ||
| 2472 | */ | ||
| 2473 | static bool update_sd_pick_busiest(struct sched_domain *sd, | ||
| 2474 | struct sd_lb_stats *sds, | ||
| 2475 | struct sched_group *sg, | ||
| 2476 | struct sg_lb_stats *sgs, | ||
| 2477 | int this_cpu) | ||
| 2478 | { | ||
| 2479 | if (sgs->avg_load <= sds->max_load) | ||
| 2480 | return false; | ||
| 2481 | |||
| 2482 | if (sgs->sum_nr_running > sgs->group_capacity) | ||
| 2483 | return true; | ||
| 2484 | |||
| 2485 | if (sgs->group_imb) | ||
| 2486 | return true; | ||
| 2487 | |||
| 2488 | /* | ||
| 2489 | * ASYM_PACKING needs to move all the work to the lowest | ||
| 2490 | * numbered CPUs in the group, therefore mark all groups | ||
| 2491 | * higher than ourself as busy. | ||
| 2492 | */ | ||
| 2493 | if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && | ||
| 2494 | this_cpu < group_first_cpu(sg)) { | ||
| 2495 | if (!sds->busiest) | ||
| 2496 | return true; | ||
| 2497 | |||
| 2498 | if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) | ||
| 2499 | return true; | ||
| 2500 | } | ||
| 2501 | |||
| 2502 | return false; | ||
| 2431 | } | 2503 | } |
| 2432 | 2504 | ||
| 2433 | /** | 2505 | /** |
| @@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2435 | * @sd: sched_domain whose statistics are to be updated. | 2507 | * @sd: sched_domain whose statistics are to be updated. |
| 2436 | * @this_cpu: Cpu for which load balance is currently performed. | 2508 | * @this_cpu: Cpu for which load balance is currently performed. |
| 2437 | * @idle: Idle status of this_cpu | 2509 | * @idle: Idle status of this_cpu |
| 2438 | * @sd_idle: Idle status of the sched_domain containing group. | 2510 | * @sd_idle: Idle status of the sched_domain containing sg. |
| 2439 | * @cpus: Set of cpus considered for load balancing. | 2511 | * @cpus: Set of cpus considered for load balancing. |
| 2440 | * @balance: Should we balance. | 2512 | * @balance: Should we balance. |
| 2441 | * @sds: variable to hold the statistics for this sched_domain. | 2513 | * @sds: variable to hold the statistics for this sched_domain. |
| @@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2446 | struct sd_lb_stats *sds) | 2518 | struct sd_lb_stats *sds) |
| 2447 | { | 2519 | { |
| 2448 | struct sched_domain *child = sd->child; | 2520 | struct sched_domain *child = sd->child; |
| 2449 | struct sched_group *group = sd->groups; | 2521 | struct sched_group *sg = sd->groups; |
| 2450 | struct sg_lb_stats sgs; | 2522 | struct sg_lb_stats sgs; |
| 2451 | int load_idx, prefer_sibling = 0; | 2523 | int load_idx, prefer_sibling = 0; |
| 2452 | 2524 | ||
| @@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2459 | do { | 2531 | do { |
| 2460 | int local_group; | 2532 | int local_group; |
| 2461 | 2533 | ||
| 2462 | local_group = cpumask_test_cpu(this_cpu, | 2534 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
| 2463 | sched_group_cpus(group)); | ||
| 2464 | memset(&sgs, 0, sizeof(sgs)); | 2535 | memset(&sgs, 0, sizeof(sgs)); |
| 2465 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, | 2536 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, |
| 2466 | local_group, cpus, balance, &sgs); | 2537 | local_group, cpus, balance, &sgs); |
| 2467 | 2538 | ||
| 2468 | if (local_group && !(*balance)) | 2539 | if (local_group && !(*balance)) |
| 2469 | return; | 2540 | return; |
| 2470 | 2541 | ||
| 2471 | sds->total_load += sgs.group_load; | 2542 | sds->total_load += sgs.group_load; |
| 2472 | sds->total_pwr += group->cpu_power; | 2543 | sds->total_pwr += sg->cpu_power; |
| 2473 | 2544 | ||
| 2474 | /* | 2545 | /* |
| 2475 | * In case the child domain prefers tasks go to siblings | 2546 | * In case the child domain prefers tasks go to siblings |
| 2476 | * first, lower the group capacity to one so that we'll try | 2547 | * first, lower the sg capacity to one so that we'll try |
| 2477 | * and move all the excess tasks away. | 2548 | * and move all the excess tasks away. |
| 2478 | */ | 2549 | */ |
| 2479 | if (prefer_sibling) | 2550 | if (prefer_sibling) |
| @@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2481 | 2552 | ||
| 2482 | if (local_group) { | 2553 | if (local_group) { |
| 2483 | sds->this_load = sgs.avg_load; | 2554 | sds->this_load = sgs.avg_load; |
| 2484 | sds->this = group; | 2555 | sds->this = sg; |
| 2485 | sds->this_nr_running = sgs.sum_nr_running; | 2556 | sds->this_nr_running = sgs.sum_nr_running; |
| 2486 | sds->this_load_per_task = sgs.sum_weighted_load; | 2557 | sds->this_load_per_task = sgs.sum_weighted_load; |
| 2487 | } else if (sgs.avg_load > sds->max_load && | 2558 | } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { |
| 2488 | (sgs.sum_nr_running > sgs.group_capacity || | ||
| 2489 | sgs.group_imb)) { | ||
| 2490 | sds->max_load = sgs.avg_load; | 2559 | sds->max_load = sgs.avg_load; |
| 2491 | sds->busiest = group; | 2560 | sds->busiest = sg; |
| 2492 | sds->busiest_nr_running = sgs.sum_nr_running; | 2561 | sds->busiest_nr_running = sgs.sum_nr_running; |
| 2493 | sds->busiest_group_capacity = sgs.group_capacity; | 2562 | sds->busiest_group_capacity = sgs.group_capacity; |
| 2494 | sds->busiest_load_per_task = sgs.sum_weighted_load; | 2563 | sds->busiest_load_per_task = sgs.sum_weighted_load; |
| 2495 | sds->group_imb = sgs.group_imb; | 2564 | sds->group_imb = sgs.group_imb; |
| 2496 | } | 2565 | } |
| 2497 | 2566 | ||
| 2498 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 2567 | update_sd_power_savings_stats(sg, sds, local_group, &sgs); |
| 2499 | group = group->next; | 2568 | sg = sg->next; |
| 2500 | } while (group != sd->groups); | 2569 | } while (sg != sd->groups); |
| 2570 | } | ||
| 2571 | |||
| 2572 | int __weak arch_sd_sibling_asym_packing(void) | ||
| 2573 | { | ||
| 2574 | return 0*SD_ASYM_PACKING; | ||
| 2575 | } | ||
| 2576 | |||
| 2577 | /** | ||
| 2578 | * check_asym_packing - Check to see if the group is packed into the | ||
| 2579 | * sched domain. | ||
| 2580 | * | ||
| 2581 | * This is primarily intended to be used at the sibling level. Some | ||
| 2582 | * cores like POWER7 prefer to use lower numbered SMT threads. In the | ||
| 2583 | * case of POWER7, it can move to lower SMT modes only when higher | ||
| 2584 | * threads are idle. When in lower SMT modes, the threads will | ||
| 2585 | * perform better since they share less core resources. Hence when we | ||
| 2586 | * have idle threads, we want them to be the higher ones. | ||
| 2587 | * | ||
| 2588 | * This packing function is run on idle threads. It checks to see if | ||
| 2589 | * the busiest CPU in this domain (core in the P7 case) has a higher | ||
| 2590 | * CPU number than the packing function is being run on. Here we are | ||
| 2591 | * assuming lower CPU number will be equivalent to a lower SMT thread | ||
| 2592 | * number. | ||
| 2593 | * | ||
| 2594 | * Returns 1 when packing is required and a task should be moved to | ||
| 2595 | * this CPU. The amount of the imbalance is returned in *imbalance. | ||
| 2596 | * | ||
| 2597 | * @sd: The sched_domain whose packing is to be checked. | ||
| 2598 | * @sds: Statistics of the sched_domain which is to be packed | ||
| 2599 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 2600 | * @imbalance: returns amount of imbalance due to packing. | ||
| 2601 | */ | ||
| 2602 | static int check_asym_packing(struct sched_domain *sd, | ||
| 2603 | struct sd_lb_stats *sds, | ||
| 2604 | int this_cpu, unsigned long *imbalance) | ||
| 2605 | { | ||
| 2606 | int busiest_cpu; | ||
| 2607 | |||
| 2608 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
| 2609 | return 0; | ||
| 2610 | |||
| 2611 | if (!sds->busiest) | ||
| 2612 | return 0; | ||
| 2613 | |||
| 2614 | busiest_cpu = group_first_cpu(sds->busiest); | ||
| 2615 | if (this_cpu > busiest_cpu) | ||
| 2616 | return 0; | ||
| 2617 | |||
| 2618 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | ||
| 2619 | SCHED_LOAD_SCALE); | ||
| 2620 | return 1; | ||
| 2501 | } | 2621 | } |
| 2502 | 2622 | ||
| 2503 | /** | 2623 | /** |
| @@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2692 | if (!(*balance)) | 2812 | if (!(*balance)) |
| 2693 | goto ret; | 2813 | goto ret; |
| 2694 | 2814 | ||
| 2815 | if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && | ||
| 2816 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | ||
| 2817 | return sds.busiest; | ||
| 2818 | |||
| 2695 | if (!sds.busiest || sds.busiest_nr_running == 0) | 2819 | if (!sds.busiest || sds.busiest_nr_running == 0) |
| 2696 | goto out_balanced; | 2820 | goto out_balanced; |
| 2697 | 2821 | ||
| @@ -2726,8 +2850,9 @@ ret: | |||
| 2726 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2850 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
| 2727 | */ | 2851 | */ |
| 2728 | static struct rq * | 2852 | static struct rq * |
| 2729 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 2853 | find_busiest_queue(struct sched_domain *sd, struct sched_group *group, |
| 2730 | unsigned long imbalance, const struct cpumask *cpus) | 2854 | enum cpu_idle_type idle, unsigned long imbalance, |
| 2855 | const struct cpumask *cpus) | ||
| 2731 | { | 2856 | { |
| 2732 | struct rq *busiest = NULL, *rq; | 2857 | struct rq *busiest = NULL, *rq; |
| 2733 | unsigned long max_load = 0; | 2858 | unsigned long max_load = 0; |
| @@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 2738 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 2863 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); |
| 2739 | unsigned long wl; | 2864 | unsigned long wl; |
| 2740 | 2865 | ||
| 2866 | if (!capacity) | ||
| 2867 | capacity = fix_small_capacity(sd, group); | ||
| 2868 | |||
| 2741 | if (!cpumask_test_cpu(i, cpus)) | 2869 | if (!cpumask_test_cpu(i, cpus)) |
| 2742 | continue; | 2870 | continue; |
| 2743 | 2871 | ||
| @@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
| 2777 | /* Working cpumask for load_balance and load_balance_newidle. */ | 2905 | /* Working cpumask for load_balance and load_balance_newidle. */ |
| 2778 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 2906 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
| 2779 | 2907 | ||
| 2780 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) | 2908 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, |
| 2909 | int busiest_cpu, int this_cpu) | ||
| 2781 | { | 2910 | { |
| 2782 | if (idle == CPU_NEWLY_IDLE) { | 2911 | if (idle == CPU_NEWLY_IDLE) { |
| 2912 | |||
| 2913 | /* | ||
| 2914 | * ASYM_PACKING needs to force migrate tasks from busy but | ||
| 2915 | * higher numbered CPUs in order to pack all tasks in the | ||
| 2916 | * lowest numbered CPUs. | ||
| 2917 | */ | ||
| 2918 | if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) | ||
| 2919 | return 1; | ||
| 2920 | |||
| 2783 | /* | 2921 | /* |
| 2784 | * The only task running in a non-idle cpu can be moved to this | 2922 | * The only task running in a non-idle cpu can be moved to this |
| 2785 | * cpu in an attempt to completely freeup the other CPU | 2923 | * cpu in an attempt to completely freeup the other CPU |
| @@ -2854,7 +2992,7 @@ redo: | |||
| 2854 | goto out_balanced; | 2992 | goto out_balanced; |
| 2855 | } | 2993 | } |
| 2856 | 2994 | ||
| 2857 | busiest = find_busiest_queue(group, idle, imbalance, cpus); | 2995 | busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); |
| 2858 | if (!busiest) { | 2996 | if (!busiest) { |
| 2859 | schedstat_inc(sd, lb_nobusyq[idle]); | 2997 | schedstat_inc(sd, lb_nobusyq[idle]); |
| 2860 | goto out_balanced; | 2998 | goto out_balanced; |
| @@ -2898,7 +3036,8 @@ redo: | |||
| 2898 | schedstat_inc(sd, lb_failed[idle]); | 3036 | schedstat_inc(sd, lb_failed[idle]); |
| 2899 | sd->nr_balance_failed++; | 3037 | sd->nr_balance_failed++; |
| 2900 | 3038 | ||
| 2901 | if (need_active_balance(sd, sd_idle, idle)) { | 3039 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), |
| 3040 | this_cpu)) { | ||
| 2902 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3041 | raw_spin_lock_irqsave(&busiest->lock, flags); |
| 2903 | 3042 | ||
| 2904 | /* don't kick the active_load_balance_cpu_stop, | 3043 | /* don't kick the active_load_balance_cpu_stop, |
| @@ -3093,13 +3232,40 @@ out_unlock: | |||
| 3093 | } | 3232 | } |
| 3094 | 3233 | ||
| 3095 | #ifdef CONFIG_NO_HZ | 3234 | #ifdef CONFIG_NO_HZ |
| 3235 | |||
| 3236 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
| 3237 | |||
| 3238 | static void trigger_sched_softirq(void *data) | ||
| 3239 | { | ||
| 3240 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
| 3241 | } | ||
| 3242 | |||
| 3243 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
| 3244 | { | ||
| 3245 | csd->func = trigger_sched_softirq; | ||
| 3246 | csd->info = NULL; | ||
| 3247 | csd->flags = 0; | ||
| 3248 | csd->priv = 0; | ||
| 3249 | } | ||
| 3250 | |||
| 3251 | /* | ||
| 3252 | * idle load balancing details | ||
| 3253 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
| 3254 | * entering idle. | ||
| 3255 | * - This idle load balancer CPU will also go into tickless mode when | ||
| 3256 | * it is idle, just like all other idle CPUs | ||
| 3257 | * - When one of the busy CPUs notices that there may be an idle rebalancing | ||
| 3258 | * needed, it will kick the idle load balancer, which then does idle | ||
| 3259 | * load balancing for all the idle CPUs. | ||
| 3260 | */ | ||
| 3096 | static struct { | 3261 | static struct { |
| 3097 | atomic_t load_balancer; | 3262 | atomic_t load_balancer; |
| 3098 | cpumask_var_t cpu_mask; | 3263 | atomic_t first_pick_cpu; |
| 3099 | cpumask_var_t ilb_grp_nohz_mask; | 3264 | atomic_t second_pick_cpu; |
| 3100 | } nohz ____cacheline_aligned = { | 3265 | cpumask_var_t idle_cpus_mask; |
| 3101 | .load_balancer = ATOMIC_INIT(-1), | 3266 | cpumask_var_t grp_idle_mask; |
| 3102 | }; | 3267 | unsigned long next_balance; /* in jiffy units */ |
| 3268 | } nohz ____cacheline_aligned; | ||
| 3103 | 3269 | ||
| 3104 | int get_nohz_load_balancer(void) | 3270 | int get_nohz_load_balancer(void) |
| 3105 | { | 3271 | { |
| @@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
| 3153 | */ | 3319 | */ |
| 3154 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | 3320 | static inline int is_semi_idle_group(struct sched_group *ilb_group) |
| 3155 | { | 3321 | { |
| 3156 | cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, | 3322 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, |
| 3157 | sched_group_cpus(ilb_group)); | 3323 | sched_group_cpus(ilb_group)); |
| 3158 | 3324 | ||
| 3159 | /* | 3325 | /* |
| 3160 | * A sched_group is semi-idle when it has at least one busy cpu | 3326 | * A sched_group is semi-idle when it has at least one busy cpu |
| 3161 | * and at least one idle cpu. | 3327 | * and at least one idle cpu. |
| 3162 | */ | 3328 | */ |
| 3163 | if (cpumask_empty(nohz.ilb_grp_nohz_mask)) | 3329 | if (cpumask_empty(nohz.grp_idle_mask)) |
| 3164 | return 0; | 3330 | return 0; |
| 3165 | 3331 | ||
| 3166 | if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) | 3332 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) |
| 3167 | return 0; | 3333 | return 0; |
| 3168 | 3334 | ||
| 3169 | return 1; | 3335 | return 1; |
| @@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu) | |||
| 3196 | * Optimize for the case when we have no idle CPUs or only one | 3362 | * Optimize for the case when we have no idle CPUs or only one |
| 3197 | * idle CPU. Don't walk the sched_domain hierarchy in such cases | 3363 | * idle CPU. Don't walk the sched_domain hierarchy in such cases |
| 3198 | */ | 3364 | */ |
| 3199 | if (cpumask_weight(nohz.cpu_mask) < 2) | 3365 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
| 3200 | goto out_done; | 3366 | goto out_done; |
| 3201 | 3367 | ||
| 3202 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3368 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
| @@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu) | |||
| 3204 | 3370 | ||
| 3205 | do { | 3371 | do { |
| 3206 | if (is_semi_idle_group(ilb_group)) | 3372 | if (is_semi_idle_group(ilb_group)) |
| 3207 | return cpumask_first(nohz.ilb_grp_nohz_mask); | 3373 | return cpumask_first(nohz.grp_idle_mask); |
| 3208 | 3374 | ||
| 3209 | ilb_group = ilb_group->next; | 3375 | ilb_group = ilb_group->next; |
| 3210 | 3376 | ||
| @@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu) | |||
| 3212 | } | 3378 | } |
| 3213 | 3379 | ||
| 3214 | out_done: | 3380 | out_done: |
| 3215 | return cpumask_first(nohz.cpu_mask); | 3381 | return nr_cpu_ids; |
| 3216 | } | 3382 | } |
| 3217 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3383 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
| 3218 | static inline int find_new_ilb(int call_cpu) | 3384 | static inline int find_new_ilb(int call_cpu) |
| 3219 | { | 3385 | { |
| 3220 | return cpumask_first(nohz.cpu_mask); | 3386 | return nr_cpu_ids; |
| 3221 | } | 3387 | } |
| 3222 | #endif | 3388 | #endif |
| 3223 | 3389 | ||
| 3224 | /* | 3390 | /* |
| 3391 | * Kick a CPU to do the nohz balancing, if it is time for it. We pick the | ||
| 3392 | * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle | ||
| 3393 | * CPU (if there is one). | ||
| 3394 | */ | ||
| 3395 | static void nohz_balancer_kick(int cpu) | ||
| 3396 | { | ||
| 3397 | int ilb_cpu; | ||
| 3398 | |||
| 3399 | nohz.next_balance++; | ||
| 3400 | |||
| 3401 | ilb_cpu = get_nohz_load_balancer(); | ||
| 3402 | |||
| 3403 | if (ilb_cpu >= nr_cpu_ids) { | ||
| 3404 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
| 3405 | if (ilb_cpu >= nr_cpu_ids) | ||
| 3406 | return; | ||
| 3407 | } | ||
| 3408 | |||
| 3409 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | ||
| 3410 | struct call_single_data *cp; | ||
| 3411 | |||
| 3412 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | ||
| 3413 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | ||
| 3414 | __smp_call_function_single(ilb_cpu, cp, 0); | ||
| 3415 | } | ||
| 3416 | return; | ||
| 3417 | } | ||
| 3418 | |||
| 3419 | /* | ||
| 3225 | * This routine will try to nominate the ilb (idle load balancing) | 3420 | * This routine will try to nominate the ilb (idle load balancing) |
| 3226 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 3421 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
| 3227 | * load balancing on behalf of all those cpus. If all the cpus in the system | 3422 | * load balancing on behalf of all those cpus. |
| 3228 | * go into this tickless mode, then there will be no ilb owner (as there is | ||
| 3229 | * no need for one) and all the cpus will sleep till the next wakeup event | ||
| 3230 | * arrives... | ||
| 3231 | * | ||
| 3232 | * For the ilb owner, tick is not stopped. And this tick will be used | ||
| 3233 | * for idle load balancing. ilb owner will still be part of | ||
| 3234 | * nohz.cpu_mask.. | ||
| 3235 | * | 3423 | * |
| 3236 | * While stopping the tick, this cpu will become the ilb owner if there | 3424 | * When the ilb owner becomes busy, we will not have new ilb owner until some |
| 3237 | * is no other owner. And will be the owner till that cpu becomes busy | 3425 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick |
| 3238 | * or if all cpus in the system stop their ticks at which point | 3426 | * idle load balancing by kicking one of the idle CPUs. |
| 3239 | * there is no need for ilb owner. | ||
| 3240 | * | 3427 | * |
| 3241 | * When the ilb owner becomes busy, it nominates another owner, during the | 3428 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this |
| 3242 | * next busy scheduler_tick() | 3429 | * ilb owner CPU in future (when there is a need for idle load balancing on |
| 3430 | * behalf of all idle CPUs). | ||
| 3243 | */ | 3431 | */ |
| 3244 | int select_nohz_load_balancer(int stop_tick) | 3432 | void select_nohz_load_balancer(int stop_tick) |
| 3245 | { | 3433 | { |
| 3246 | int cpu = smp_processor_id(); | 3434 | int cpu = smp_processor_id(); |
| 3247 | 3435 | ||
| 3248 | if (stop_tick) { | 3436 | if (stop_tick) { |
| 3249 | cpu_rq(cpu)->in_nohz_recently = 1; | ||
| 3250 | |||
| 3251 | if (!cpu_active(cpu)) { | 3437 | if (!cpu_active(cpu)) { |
| 3252 | if (atomic_read(&nohz.load_balancer) != cpu) | 3438 | if (atomic_read(&nohz.load_balancer) != cpu) |
| 3253 | return 0; | 3439 | return; |
| 3254 | 3440 | ||
| 3255 | /* | 3441 | /* |
| 3256 | * If we are going offline and still the leader, | 3442 | * If we are going offline and still the leader, |
| 3257 | * give up! | 3443 | * give up! |
| 3258 | */ | 3444 | */ |
| 3259 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3445 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
| 3446 | nr_cpu_ids) != cpu) | ||
| 3260 | BUG(); | 3447 | BUG(); |
| 3261 | 3448 | ||
| 3262 | return 0; | 3449 | return; |
| 3263 | } | 3450 | } |
| 3264 | 3451 | ||
| 3265 | cpumask_set_cpu(cpu, nohz.cpu_mask); | 3452 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
| 3266 | 3453 | ||
| 3267 | /* time for ilb owner also to sleep */ | 3454 | if (atomic_read(&nohz.first_pick_cpu) == cpu) |
| 3268 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { | 3455 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); |
| 3269 | if (atomic_read(&nohz.load_balancer) == cpu) | 3456 | if (atomic_read(&nohz.second_pick_cpu) == cpu) |
| 3270 | atomic_set(&nohz.load_balancer, -1); | 3457 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); |
| 3271 | return 0; | ||
| 3272 | } | ||
| 3273 | 3458 | ||
| 3274 | if (atomic_read(&nohz.load_balancer) == -1) { | 3459 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { |
| 3275 | /* make me the ilb owner */ | ||
| 3276 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | ||
| 3277 | return 1; | ||
| 3278 | } else if (atomic_read(&nohz.load_balancer) == cpu) { | ||
| 3279 | int new_ilb; | 3460 | int new_ilb; |
| 3280 | 3461 | ||
| 3281 | if (!(sched_smt_power_savings || | 3462 | /* make me the ilb owner */ |
| 3282 | sched_mc_power_savings)) | 3463 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, |
| 3283 | return 1; | 3464 | cpu) != nr_cpu_ids) |
| 3465 | return; | ||
| 3466 | |||
| 3284 | /* | 3467 | /* |
| 3285 | * Check to see if there is a more power-efficient | 3468 | * Check to see if there is a more power-efficient |
| 3286 | * ilb. | 3469 | * ilb. |
| 3287 | */ | 3470 | */ |
| 3288 | new_ilb = find_new_ilb(cpu); | 3471 | new_ilb = find_new_ilb(cpu); |
| 3289 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | 3472 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { |
| 3290 | atomic_set(&nohz.load_balancer, -1); | 3473 | atomic_set(&nohz.load_balancer, nr_cpu_ids); |
| 3291 | resched_cpu(new_ilb); | 3474 | resched_cpu(new_ilb); |
| 3292 | return 0; | 3475 | return; |
| 3293 | } | 3476 | } |
| 3294 | return 1; | 3477 | return; |
| 3295 | } | 3478 | } |
| 3296 | } else { | 3479 | } else { |
| 3297 | if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) | 3480 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) |
| 3298 | return 0; | 3481 | return; |
| 3299 | 3482 | ||
| 3300 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | 3483 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
| 3301 | 3484 | ||
| 3302 | if (atomic_read(&nohz.load_balancer) == cpu) | 3485 | if (atomic_read(&nohz.load_balancer) == cpu) |
| 3303 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3486 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, |
| 3487 | nr_cpu_ids) != cpu) | ||
| 3304 | BUG(); | 3488 | BUG(); |
| 3305 | } | 3489 | } |
| 3306 | return 0; | 3490 | return; |
| 3307 | } | 3491 | } |
| 3308 | #endif | 3492 | #endif |
| 3309 | 3493 | ||
| @@ -3385,11 +3569,102 @@ out: | |||
| 3385 | rq->next_balance = next_balance; | 3569 | rq->next_balance = next_balance; |
| 3386 | } | 3570 | } |
| 3387 | 3571 | ||
| 3572 | #ifdef CONFIG_NO_HZ | ||
| 3388 | /* | 3573 | /* |
| 3389 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3574 | * In CONFIG_NO_HZ case, the idle balance kickee will do the |
| 3390 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | ||
| 3391 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3575 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
| 3392 | */ | 3576 | */ |
| 3577 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | ||
| 3578 | { | ||
| 3579 | struct rq *this_rq = cpu_rq(this_cpu); | ||
| 3580 | struct rq *rq; | ||
| 3581 | int balance_cpu; | ||
| 3582 | |||
| 3583 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | ||
| 3584 | return; | ||
| 3585 | |||
| 3586 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | ||
| 3587 | if (balance_cpu == this_cpu) | ||
| 3588 | continue; | ||
| 3589 | |||
| 3590 | /* | ||
| 3591 | * If this cpu gets work to do, stop the load balancing | ||
| 3592 | * work being done for other cpus. Next load | ||
| 3593 | * balancing owner will pick it up. | ||
| 3594 | */ | ||
| 3595 | if (need_resched()) { | ||
| 3596 | this_rq->nohz_balance_kick = 0; | ||
| 3597 | break; | ||
| 3598 | } | ||
| 3599 | |||
| 3600 | raw_spin_lock_irq(&this_rq->lock); | ||
| 3601 | update_rq_clock(this_rq); | ||
| 3602 | update_cpu_load(this_rq); | ||
| 3603 | raw_spin_unlock_irq(&this_rq->lock); | ||
| 3604 | |||
| 3605 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
| 3606 | |||
| 3607 | rq = cpu_rq(balance_cpu); | ||
| 3608 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
| 3609 | this_rq->next_balance = rq->next_balance; | ||
| 3610 | } | ||
| 3611 | nohz.next_balance = this_rq->next_balance; | ||
| 3612 | this_rq->nohz_balance_kick = 0; | ||
| 3613 | } | ||
| 3614 | |||
| 3615 | /* | ||
| 3616 | * Current heuristic for kicking the idle load balancer | ||
| 3617 | * - first_pick_cpu is the one of the busy CPUs. It will kick | ||
| 3618 | * idle load balancer when it has more than one process active. This | ||
| 3619 | * eliminates the need for idle load balancing altogether when we have | ||
| 3620 | * only one running process in the system (common case). | ||
| 3621 | * - If there are more than one busy CPU, idle load balancer may have | ||
| 3622 | * to run for active_load_balance to happen (i.e., two busy CPUs are | ||
| 3623 | * SMT or core siblings and can run better if they move to different | ||
| 3624 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
| 3625 | * which will kick idle load balancer as soon as it has any load. | ||
| 3626 | */ | ||
| 3627 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | ||
| 3628 | { | ||
| 3629 | unsigned long now = jiffies; | ||
| 3630 | int ret; | ||
| 3631 | int first_pick_cpu, second_pick_cpu; | ||
| 3632 | |||
| 3633 | if (time_before(now, nohz.next_balance)) | ||
| 3634 | return 0; | ||
| 3635 | |||
| 3636 | if (!rq->nr_running) | ||
| 3637 | return 0; | ||
| 3638 | |||
| 3639 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | ||
| 3640 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | ||
| 3641 | |||
| 3642 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | ||
| 3643 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
| 3644 | return 0; | ||
| 3645 | |||
| 3646 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | ||
| 3647 | if (ret == nr_cpu_ids || ret == cpu) { | ||
| 3648 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
| 3649 | if (rq->nr_running > 1) | ||
| 3650 | return 1; | ||
| 3651 | } else { | ||
| 3652 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | ||
| 3653 | if (ret == nr_cpu_ids || ret == cpu) { | ||
| 3654 | if (rq->nr_running) | ||
| 3655 | return 1; | ||
| 3656 | } | ||
| 3657 | } | ||
| 3658 | return 0; | ||
| 3659 | } | ||
| 3660 | #else | ||
| 3661 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | ||
| 3662 | #endif | ||
| 3663 | |||
| 3664 | /* | ||
| 3665 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
| 3666 | * Also triggered for nohz idle balancing (with nohz_balancing_kick set). | ||
| 3667 | */ | ||
| 3393 | static void run_rebalance_domains(struct softirq_action *h) | 3668 | static void run_rebalance_domains(struct softirq_action *h) |
| 3394 | { | 3669 | { |
| 3395 | int this_cpu = smp_processor_id(); | 3670 | int this_cpu = smp_processor_id(); |
| @@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 3399 | 3674 | ||
| 3400 | rebalance_domains(this_cpu, idle); | 3675 | rebalance_domains(this_cpu, idle); |
| 3401 | 3676 | ||
| 3402 | #ifdef CONFIG_NO_HZ | ||
| 3403 | /* | 3677 | /* |
| 3404 | * If this cpu is the owner for idle load balancing, then do the | 3678 | * If this cpu has a pending nohz_balance_kick, then do the |
| 3405 | * balancing on behalf of the other idle cpus whose ticks are | 3679 | * balancing on behalf of the other idle cpus whose ticks are |
| 3406 | * stopped. | 3680 | * stopped. |
| 3407 | */ | 3681 | */ |
| 3408 | if (this_rq->idle_at_tick && | 3682 | nohz_idle_balance(this_cpu, idle); |
| 3409 | atomic_read(&nohz.load_balancer) == this_cpu) { | ||
| 3410 | struct rq *rq; | ||
| 3411 | int balance_cpu; | ||
| 3412 | |||
| 3413 | for_each_cpu(balance_cpu, nohz.cpu_mask) { | ||
| 3414 | if (balance_cpu == this_cpu) | ||
| 3415 | continue; | ||
| 3416 | |||
| 3417 | /* | ||
| 3418 | * If this cpu gets work to do, stop the load balancing | ||
| 3419 | * work being done for other cpus. Next load | ||
| 3420 | * balancing owner will pick it up. | ||
| 3421 | */ | ||
| 3422 | if (need_resched()) | ||
| 3423 | break; | ||
| 3424 | |||
| 3425 | rebalance_domains(balance_cpu, CPU_IDLE); | ||
| 3426 | |||
| 3427 | rq = cpu_rq(balance_cpu); | ||
| 3428 | if (time_after(this_rq->next_balance, rq->next_balance)) | ||
| 3429 | this_rq->next_balance = rq->next_balance; | ||
| 3430 | } | ||
| 3431 | } | ||
| 3432 | #endif | ||
| 3433 | } | 3683 | } |
| 3434 | 3684 | ||
| 3435 | static inline int on_null_domain(int cpu) | 3685 | static inline int on_null_domain(int cpu) |
| @@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu) | |||
| 3439 | 3689 | ||
| 3440 | /* | 3690 | /* |
| 3441 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3691 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
| 3442 | * | ||
| 3443 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | ||
| 3444 | * idle load balancing owner or decide to stop the periodic load balancing, | ||
| 3445 | * if the whole system is idle. | ||
| 3446 | */ | 3692 | */ |
| 3447 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3693 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
| 3448 | { | 3694 | { |
| 3449 | #ifdef CONFIG_NO_HZ | ||
| 3450 | /* | ||
| 3451 | * If we were in the nohz mode recently and busy at the current | ||
| 3452 | * scheduler tick, then check if we need to nominate new idle | ||
| 3453 | * load balancer. | ||
| 3454 | */ | ||
| 3455 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | ||
| 3456 | rq->in_nohz_recently = 0; | ||
| 3457 | |||
| 3458 | if (atomic_read(&nohz.load_balancer) == cpu) { | ||
| 3459 | cpumask_clear_cpu(cpu, nohz.cpu_mask); | ||
| 3460 | atomic_set(&nohz.load_balancer, -1); | ||
| 3461 | } | ||
| 3462 | |||
| 3463 | if (atomic_read(&nohz.load_balancer) == -1) { | ||
| 3464 | int ilb = find_new_ilb(cpu); | ||
| 3465 | |||
| 3466 | if (ilb < nr_cpu_ids) | ||
| 3467 | resched_cpu(ilb); | ||
| 3468 | } | ||
| 3469 | } | ||
| 3470 | |||
| 3471 | /* | ||
| 3472 | * If this cpu is idle and doing idle load balancing for all the | ||
| 3473 | * cpus with ticks stopped, is it time for that to stop? | ||
| 3474 | */ | ||
| 3475 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | ||
| 3476 | cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | ||
| 3477 | resched_cpu(cpu); | ||
| 3478 | return; | ||
| 3479 | } | ||
| 3480 | |||
| 3481 | /* | ||
| 3482 | * If this cpu is idle and the idle load balancing is done by | ||
| 3483 | * someone else, then no need raise the SCHED_SOFTIRQ | ||
| 3484 | */ | ||
| 3485 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | ||
| 3486 | cpumask_test_cpu(cpu, nohz.cpu_mask)) | ||
| 3487 | return; | ||
| 3488 | #endif | ||
| 3489 | /* Don't need to rebalance while attached to NULL domain */ | 3695 | /* Don't need to rebalance while attached to NULL domain */ |
| 3490 | if (time_after_eq(jiffies, rq->next_balance) && | 3696 | if (time_after_eq(jiffies, rq->next_balance) && |
| 3491 | likely(!on_null_domain(cpu))) | 3697 | likely(!on_null_domain(cpu))) |
| 3492 | raise_softirq(SCHED_SOFTIRQ); | 3698 | raise_softirq(SCHED_SOFTIRQ); |
| 3699 | #ifdef CONFIG_NO_HZ | ||
| 3700 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | ||
| 3701 | nohz_balancer_kick(cpu); | ||
| 3702 | #endif | ||
| 3493 | } | 3703 | } |
| 3494 | 3704 | ||
| 3495 | static void rq_online_fair(struct rq *rq) | 3705 | static void rq_online_fair(struct rq *rq) |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c6..d10c80ebb67a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
| 1663 | { | 1663 | { |
| 1664 | unsigned long soft, hard; | 1664 | unsigned long soft, hard; |
| 1665 | 1665 | ||
| 1666 | if (!p->signal) | ||
| 1667 | return; | ||
| 1668 | |||
| 1669 | /* max may change after cur was read, this will be fixed next tick */ | 1666 | /* max may change after cur was read, this will be fixed next tick */ |
| 1670 | soft = task_rlimit(p, RLIMIT_RTTIME); | 1667 | soft = task_rlimit(p, RLIMIT_RTTIME); |
| 1671 | hard = task_rlimit_max(p, RLIMIT_RTTIME); | 1668 | hard = task_rlimit_max(p, RLIMIT_RTTIME); |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b0..25c2f962f6fc 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
| @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
| 295 | static inline void account_group_user_time(struct task_struct *tsk, | 295 | static inline void account_group_user_time(struct task_struct *tsk, |
| 296 | cputime_t cputime) | 296 | cputime_t cputime) |
| 297 | { | 297 | { |
| 298 | struct thread_group_cputimer *cputimer; | 298 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
| 299 | |||
| 300 | /* tsk == current, ensure it is safe to use ->signal */ | ||
| 301 | if (unlikely(tsk->exit_state)) | ||
| 302 | return; | ||
| 303 | |||
| 304 | cputimer = &tsk->signal->cputimer; | ||
| 305 | 299 | ||
| 306 | if (!cputimer->running) | 300 | if (!cputimer->running) |
| 307 | return; | 301 | return; |
| @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
| 325 | static inline void account_group_system_time(struct task_struct *tsk, | 319 | static inline void account_group_system_time(struct task_struct *tsk, |
| 326 | cputime_t cputime) | 320 | cputime_t cputime) |
| 327 | { | 321 | { |
| 328 | struct thread_group_cputimer *cputimer; | 322 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
| 329 | |||
| 330 | /* tsk == current, ensure it is safe to use ->signal */ | ||
| 331 | if (unlikely(tsk->exit_state)) | ||
| 332 | return; | ||
| 333 | |||
| 334 | cputimer = &tsk->signal->cputimer; | ||
| 335 | 323 | ||
| 336 | if (!cputimer->running) | 324 | if (!cputimer->running) |
| 337 | return; | 325 | return; |
| @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
| 355 | static inline void account_group_exec_runtime(struct task_struct *tsk, | 343 | static inline void account_group_exec_runtime(struct task_struct *tsk, |
| 356 | unsigned long long ns) | 344 | unsigned long long ns) |
| 357 | { | 345 | { |
| 358 | struct thread_group_cputimer *cputimer; | 346 | struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; |
| 359 | struct signal_struct *sig; | ||
| 360 | |||
| 361 | sig = tsk->signal; | ||
| 362 | /* see __exit_signal()->task_rq_unlock_wait() */ | ||
| 363 | barrier(); | ||
| 364 | if (unlikely(!sig)) | ||
| 365 | return; | ||
| 366 | |||
| 367 | cputimer = &sig->cputimer; | ||
| 368 | 347 | ||
| 369 | if (!cputimer->running) | 348 | if (!cputimer->running) |
| 370 | return; | 349 | return; |
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c43645298..000000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null | |||
| @@ -1,227 +0,0 @@ | |||
| 1 | /* Slow work debugging | ||
| 2 | * | ||
| 3 | * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. | ||
| 4 | * Written by David Howells (dhowells@redhat.com) | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public Licence | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the Licence, or (at your option) any later version. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/module.h> | ||
| 13 | #include <linux/slow-work.h> | ||
| 14 | #include <linux/fs.h> | ||
| 15 | #include <linux/time.h> | ||
| 16 | #include <linux/seq_file.h> | ||
| 17 | #include "slow-work.h" | ||
| 18 | |||
| 19 | #define ITERATOR_SHIFT (BITS_PER_LONG - 4) | ||
| 20 | #define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) | ||
| 21 | #define ITERATOR_COUNTER (~ITERATOR_SELECTOR) | ||
| 22 | |||
| 23 | void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) | ||
| 24 | { | ||
| 25 | seq_puts(m, "Slow-work: New thread"); | ||
| 26 | } | ||
| 27 | |||
| 28 | /* | ||
| 29 | * Render the time mark field on a work item into a 5-char time with units plus | ||
| 30 | * a space | ||
| 31 | */ | ||
| 32 | static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) | ||
| 33 | { | ||
| 34 | struct timespec now, diff; | ||
| 35 | |||
| 36 | now = CURRENT_TIME; | ||
| 37 | diff = timespec_sub(now, work->mark); | ||
| 38 | |||
| 39 | if (diff.tv_sec < 0) | ||
| 40 | seq_puts(m, " -ve "); | ||
| 41 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) | ||
| 42 | seq_printf(m, "%3luns ", diff.tv_nsec); | ||
| 43 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) | ||
| 44 | seq_printf(m, "%3luus ", diff.tv_nsec / 1000); | ||
| 45 | else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) | ||
| 46 | seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); | ||
| 47 | else if (diff.tv_sec <= 1) | ||
| 48 | seq_puts(m, " 1s "); | ||
| 49 | else if (diff.tv_sec < 60) | ||
| 50 | seq_printf(m, "%4lus ", diff.tv_sec); | ||
| 51 | else if (diff.tv_sec < 60 * 60) | ||
| 52 | seq_printf(m, "%4lum ", diff.tv_sec / 60); | ||
| 53 | else if (diff.tv_sec < 60 * 60 * 24) | ||
| 54 | seq_printf(m, "%4luh ", diff.tv_sec / 3600); | ||
| 55 | else | ||
| 56 | seq_puts(m, "exces "); | ||
| 57 | } | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Describe a slow work item for debugfs | ||
| 61 | */ | ||
| 62 | static int slow_work_runqueue_show(struct seq_file *m, void *v) | ||
| 63 | { | ||
| 64 | struct slow_work *work; | ||
| 65 | struct list_head *p = v; | ||
| 66 | unsigned long id; | ||
| 67 | |||
| 68 | switch ((unsigned long) v) { | ||
| 69 | case 1: | ||
| 70 | seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); | ||
| 71 | return 0; | ||
| 72 | case 2: | ||
| 73 | seq_puts(m, "=== ===== ================ == ===== ==========\n"); | ||
| 74 | return 0; | ||
| 75 | |||
| 76 | case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: | ||
| 77 | id = (unsigned long) v - 3; | ||
| 78 | |||
| 79 | read_lock(&slow_work_execs_lock); | ||
| 80 | work = slow_work_execs[id]; | ||
| 81 | if (work) { | ||
| 82 | smp_read_barrier_depends(); | ||
| 83 | |||
| 84 | seq_printf(m, "%3lu %5d %16p %2lx ", | ||
| 85 | id, slow_work_pids[id], work, work->flags); | ||
| 86 | slow_work_print_mark(m, work); | ||
| 87 | |||
| 88 | if (work->ops->desc) | ||
| 89 | work->ops->desc(work, m); | ||
| 90 | seq_putc(m, '\n'); | ||
| 91 | } | ||
| 92 | read_unlock(&slow_work_execs_lock); | ||
| 93 | return 0; | ||
| 94 | |||
| 95 | default: | ||
| 96 | work = list_entry(p, struct slow_work, link); | ||
| 97 | seq_printf(m, "%3s - %16p %2lx ", | ||
| 98 | work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", | ||
| 99 | work, work->flags); | ||
| 100 | slow_work_print_mark(m, work); | ||
| 101 | |||
| 102 | if (work->ops->desc) | ||
| 103 | work->ops->desc(work, m); | ||
| 104 | seq_putc(m, '\n'); | ||
| 105 | return 0; | ||
| 106 | } | ||
| 107 | } | ||
| 108 | |||
| 109 | /* | ||
| 110 | * map the iterator to a work item | ||
| 111 | */ | ||
| 112 | static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) | ||
| 113 | { | ||
| 114 | struct list_head *p; | ||
| 115 | unsigned long count, id; | ||
| 116 | |||
| 117 | switch (*_pos >> ITERATOR_SHIFT) { | ||
| 118 | case 0x0: | ||
| 119 | if (*_pos == 0) | ||
| 120 | *_pos = 1; | ||
| 121 | if (*_pos < 3) | ||
| 122 | return (void *)(unsigned long) *_pos; | ||
| 123 | if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) | ||
| 124 | for (id = *_pos - 3; | ||
| 125 | id < SLOW_WORK_THREAD_LIMIT; | ||
| 126 | id++, (*_pos)++) | ||
| 127 | if (slow_work_execs[id]) | ||
| 128 | return (void *)(unsigned long) *_pos; | ||
| 129 | *_pos = 0x1UL << ITERATOR_SHIFT; | ||
| 130 | |||
| 131 | case 0x1: | ||
| 132 | count = *_pos & ITERATOR_COUNTER; | ||
| 133 | list_for_each(p, &slow_work_queue) { | ||
| 134 | if (count == 0) | ||
| 135 | return p; | ||
| 136 | count--; | ||
| 137 | } | ||
| 138 | *_pos = 0x2UL << ITERATOR_SHIFT; | ||
| 139 | |||
| 140 | case 0x2: | ||
| 141 | count = *_pos & ITERATOR_COUNTER; | ||
| 142 | list_for_each(p, &vslow_work_queue) { | ||
| 143 | if (count == 0) | ||
| 144 | return p; | ||
| 145 | count--; | ||
| 146 | } | ||
| 147 | *_pos = 0x3UL << ITERATOR_SHIFT; | ||
| 148 | |||
| 149 | default: | ||
| 150 | return NULL; | ||
| 151 | } | ||
| 152 | } | ||
| 153 | |||
| 154 | /* | ||
| 155 | * set up the iterator to start reading from the first line | ||
| 156 | */ | ||
| 157 | static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) | ||
| 158 | { | ||
| 159 | spin_lock_irq(&slow_work_queue_lock); | ||
| 160 | return slow_work_runqueue_index(m, _pos); | ||
| 161 | } | ||
| 162 | |||
| 163 | /* | ||
| 164 | * move to the next line | ||
| 165 | */ | ||
| 166 | static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) | ||
| 167 | { | ||
| 168 | struct list_head *p = v; | ||
| 169 | unsigned long selector = *_pos >> ITERATOR_SHIFT; | ||
| 170 | |||
| 171 | (*_pos)++; | ||
| 172 | switch (selector) { | ||
| 173 | case 0x0: | ||
| 174 | return slow_work_runqueue_index(m, _pos); | ||
| 175 | |||
| 176 | case 0x1: | ||
| 177 | if (*_pos >> ITERATOR_SHIFT == 0x1) { | ||
| 178 | p = p->next; | ||
| 179 | if (p != &slow_work_queue) | ||
| 180 | return p; | ||
| 181 | } | ||
| 182 | *_pos = 0x2UL << ITERATOR_SHIFT; | ||
| 183 | p = &vslow_work_queue; | ||
| 184 | |||
| 185 | case 0x2: | ||
| 186 | if (*_pos >> ITERATOR_SHIFT == 0x2) { | ||
| 187 | p = p->next; | ||
| 188 | if (p != &vslow_work_queue) | ||
| 189 | return p; | ||
| 190 | } | ||
| 191 | *_pos = 0x3UL << ITERATOR_SHIFT; | ||
| 192 | |||
| 193 | default: | ||
| 194 | return NULL; | ||
| 195 | } | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * clean up after reading | ||
| 200 | */ | ||
| 201 | static void slow_work_runqueue_stop(struct seq_file *m, void *v) | ||
| 202 | { | ||
| 203 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 204 | } | ||
| 205 | |||
| 206 | static const struct seq_operations slow_work_runqueue_ops = { | ||
| 207 | .start = slow_work_runqueue_start, | ||
| 208 | .stop = slow_work_runqueue_stop, | ||
| 209 | .next = slow_work_runqueue_next, | ||
| 210 | .show = slow_work_runqueue_show, | ||
| 211 | }; | ||
| 212 | |||
| 213 | /* | ||
| 214 | * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents | ||
| 215 | */ | ||
| 216 | static int slow_work_runqueue_open(struct inode *inode, struct file *file) | ||
| 217 | { | ||
| 218 | return seq_open(file, &slow_work_runqueue_ops); | ||
| 219 | } | ||
| 220 | |||
| 221 | const struct file_operations slow_work_runqueue_fops = { | ||
| 222 | .owner = THIS_MODULE, | ||
| 223 | .open = slow_work_runqueue_open, | ||
| 224 | .read = seq_read, | ||
| 225 | .llseek = seq_lseek, | ||
| 226 | .release = seq_release, | ||
| 227 | }; | ||
diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4f..000000000000 --- a/kernel/slow-work.c +++ /dev/null | |||
| @@ -1,1068 +0,0 @@ | |||
| 1 | /* Worker thread pool for slow items, such as filesystem lookups or mkdirs | ||
| 2 | * | ||
| 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | ||
| 4 | * Written by David Howells (dhowells@redhat.com) | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public Licence | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the Licence, or (at your option) any later version. | ||
| 10 | * | ||
| 11 | * See Documentation/slow-work.txt | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/slow-work.h> | ||
| 16 | #include <linux/kthread.h> | ||
| 17 | #include <linux/freezer.h> | ||
| 18 | #include <linux/wait.h> | ||
| 19 | #include <linux/debugfs.h> | ||
| 20 | #include "slow-work.h" | ||
| 21 | |||
| 22 | static void slow_work_cull_timeout(unsigned long); | ||
| 23 | static void slow_work_oom_timeout(unsigned long); | ||
| 24 | |||
| 25 | #ifdef CONFIG_SYSCTL | ||
| 26 | static int slow_work_min_threads_sysctl(struct ctl_table *, int, | ||
| 27 | void __user *, size_t *, loff_t *); | ||
| 28 | |||
| 29 | static int slow_work_max_threads_sysctl(struct ctl_table *, int , | ||
| 30 | void __user *, size_t *, loff_t *); | ||
| 31 | #endif | ||
| 32 | |||
| 33 | /* | ||
| 34 | * The pool of threads has at least min threads in it as long as someone is | ||
| 35 | * using the facility, and may have as many as max. | ||
| 36 | * | ||
| 37 | * A portion of the pool may be processing very slow operations. | ||
| 38 | */ | ||
| 39 | static unsigned slow_work_min_threads = 2; | ||
| 40 | static unsigned slow_work_max_threads = 4; | ||
| 41 | static unsigned vslow_work_proportion = 50; /* % of threads that may process | ||
| 42 | * very slow work */ | ||
| 43 | |||
| 44 | #ifdef CONFIG_SYSCTL | ||
| 45 | static const int slow_work_min_min_threads = 2; | ||
| 46 | static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; | ||
| 47 | static const int slow_work_min_vslow = 1; | ||
| 48 | static const int slow_work_max_vslow = 99; | ||
| 49 | |||
| 50 | ctl_table slow_work_sysctls[] = { | ||
| 51 | { | ||
| 52 | .procname = "min-threads", | ||
| 53 | .data = &slow_work_min_threads, | ||
| 54 | .maxlen = sizeof(unsigned), | ||
| 55 | .mode = 0644, | ||
| 56 | .proc_handler = slow_work_min_threads_sysctl, | ||
| 57 | .extra1 = (void *) &slow_work_min_min_threads, | ||
| 58 | .extra2 = &slow_work_max_threads, | ||
| 59 | }, | ||
| 60 | { | ||
| 61 | .procname = "max-threads", | ||
| 62 | .data = &slow_work_max_threads, | ||
| 63 | .maxlen = sizeof(unsigned), | ||
| 64 | .mode = 0644, | ||
| 65 | .proc_handler = slow_work_max_threads_sysctl, | ||
| 66 | .extra1 = &slow_work_min_threads, | ||
| 67 | .extra2 = (void *) &slow_work_max_max_threads, | ||
| 68 | }, | ||
| 69 | { | ||
| 70 | .procname = "vslow-percentage", | ||
| 71 | .data = &vslow_work_proportion, | ||
| 72 | .maxlen = sizeof(unsigned), | ||
| 73 | .mode = 0644, | ||
| 74 | .proc_handler = proc_dointvec_minmax, | ||
| 75 | .extra1 = (void *) &slow_work_min_vslow, | ||
| 76 | .extra2 = (void *) &slow_work_max_vslow, | ||
| 77 | }, | ||
| 78 | {} | ||
| 79 | }; | ||
| 80 | #endif | ||
| 81 | |||
| 82 | /* | ||
| 83 | * The active state of the thread pool | ||
| 84 | */ | ||
| 85 | static atomic_t slow_work_thread_count; | ||
| 86 | static atomic_t vslow_work_executing_count; | ||
| 87 | |||
| 88 | static bool slow_work_may_not_start_new_thread; | ||
| 89 | static bool slow_work_cull; /* cull a thread due to lack of activity */ | ||
| 90 | static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); | ||
| 91 | static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); | ||
| 92 | static struct slow_work slow_work_new_thread; /* new thread starter */ | ||
| 93 | |||
| 94 | /* | ||
| 95 | * slow work ID allocation (use slow_work_queue_lock) | ||
| 96 | */ | ||
| 97 | static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Unregistration tracking to prevent put_ref() from disappearing during module | ||
| 101 | * unload | ||
| 102 | */ | ||
| 103 | #ifdef CONFIG_MODULES | ||
| 104 | static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; | ||
| 105 | static struct module *slow_work_unreg_module; | ||
| 106 | static struct slow_work *slow_work_unreg_work_item; | ||
| 107 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); | ||
| 108 | static DEFINE_MUTEX(slow_work_unreg_sync_lock); | ||
| 109 | |||
| 110 | static void slow_work_set_thread_processing(int id, struct slow_work *work) | ||
| 111 | { | ||
| 112 | if (work) | ||
| 113 | slow_work_thread_processing[id] = work->owner; | ||
| 114 | } | ||
| 115 | static void slow_work_done_thread_processing(int id, struct slow_work *work) | ||
| 116 | { | ||
| 117 | struct module *module = slow_work_thread_processing[id]; | ||
| 118 | |||
| 119 | slow_work_thread_processing[id] = NULL; | ||
| 120 | smp_mb(); | ||
| 121 | if (slow_work_unreg_work_item == work || | ||
| 122 | slow_work_unreg_module == module) | ||
| 123 | wake_up_all(&slow_work_unreg_wq); | ||
| 124 | } | ||
| 125 | static void slow_work_clear_thread_processing(int id) | ||
| 126 | { | ||
| 127 | slow_work_thread_processing[id] = NULL; | ||
| 128 | } | ||
| 129 | #else | ||
| 130 | static void slow_work_set_thread_processing(int id, struct slow_work *work) {} | ||
| 131 | static void slow_work_done_thread_processing(int id, struct slow_work *work) {} | ||
| 132 | static void slow_work_clear_thread_processing(int id) {} | ||
| 133 | #endif | ||
| 134 | |||
| 135 | /* | ||
| 136 | * Data for tracking currently executing items for indication through /proc | ||
| 137 | */ | ||
| 138 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 139 | struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; | ||
| 140 | pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; | ||
| 141 | DEFINE_RWLOCK(slow_work_execs_lock); | ||
| 142 | #endif | ||
| 143 | |||
| 144 | /* | ||
| 145 | * The queues of work items and the lock governing access to them. These are | ||
| 146 | * shared between all the CPUs. It doesn't make sense to have per-CPU queues | ||
| 147 | * as the number of threads bears no relation to the number of CPUs. | ||
| 148 | * | ||
| 149 | * There are two queues of work items: one for slow work items, and one for | ||
| 150 | * very slow work items. | ||
| 151 | */ | ||
| 152 | LIST_HEAD(slow_work_queue); | ||
| 153 | LIST_HEAD(vslow_work_queue); | ||
| 154 | DEFINE_SPINLOCK(slow_work_queue_lock); | ||
| 155 | |||
| 156 | /* | ||
| 157 | * The following are two wait queues that get pinged when a work item is placed | ||
| 158 | * on an empty queue. These allow work items that are hogging a thread by | ||
| 159 | * sleeping in a way that could be deferred to yield their thread and enqueue | ||
| 160 | * themselves. | ||
| 161 | */ | ||
| 162 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); | ||
| 163 | static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); | ||
| 164 | |||
| 165 | /* | ||
| 166 | * The thread controls. A variable used to signal to the threads that they | ||
| 167 | * should exit when the queue is empty, a waitqueue used by the threads to wait | ||
| 168 | * for signals, and a completion set by the last thread to exit. | ||
| 169 | */ | ||
| 170 | static bool slow_work_threads_should_exit; | ||
| 171 | static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); | ||
| 172 | static DECLARE_COMPLETION(slow_work_last_thread_exited); | ||
| 173 | |||
| 174 | /* | ||
| 175 | * The number of users of the thread pool and its lock. Whilst this is zero we | ||
| 176 | * have no threads hanging around, and when this reaches zero, we wait for all | ||
| 177 | * active or queued work items to complete and kill all the threads we do have. | ||
| 178 | */ | ||
| 179 | static int slow_work_user_count; | ||
| 180 | static DEFINE_MUTEX(slow_work_user_lock); | ||
| 181 | |||
| 182 | static inline int slow_work_get_ref(struct slow_work *work) | ||
| 183 | { | ||
| 184 | if (work->ops->get_ref) | ||
| 185 | return work->ops->get_ref(work); | ||
| 186 | |||
| 187 | return 0; | ||
| 188 | } | ||
| 189 | |||
| 190 | static inline void slow_work_put_ref(struct slow_work *work) | ||
| 191 | { | ||
| 192 | if (work->ops->put_ref) | ||
| 193 | work->ops->put_ref(work); | ||
| 194 | } | ||
| 195 | |||
| 196 | /* | ||
| 197 | * Calculate the maximum number of active threads in the pool that are | ||
| 198 | * permitted to process very slow work items. | ||
| 199 | * | ||
| 200 | * The answer is rounded up to at least 1, but may not equal or exceed the | ||
| 201 | * maximum number of the threads in the pool. This means we always have at | ||
| 202 | * least one thread that can process slow work items, and we always have at | ||
| 203 | * least one thread that won't get tied up doing so. | ||
| 204 | */ | ||
| 205 | static unsigned slow_work_calc_vsmax(void) | ||
| 206 | { | ||
| 207 | unsigned vsmax; | ||
| 208 | |||
| 209 | vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; | ||
| 210 | vsmax /= 100; | ||
| 211 | vsmax = max(vsmax, 1U); | ||
| 212 | return min(vsmax, slow_work_max_threads - 1); | ||
| 213 | } | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Attempt to execute stuff queued on a slow thread. Return true if we managed | ||
| 217 | * it, false if there was nothing to do. | ||
| 218 | */ | ||
| 219 | static noinline bool slow_work_execute(int id) | ||
| 220 | { | ||
| 221 | struct slow_work *work = NULL; | ||
| 222 | unsigned vsmax; | ||
| 223 | bool very_slow; | ||
| 224 | |||
| 225 | vsmax = slow_work_calc_vsmax(); | ||
| 226 | |||
| 227 | /* see if we can schedule a new thread to be started if we're not | ||
| 228 | * keeping up with the work */ | ||
| 229 | if (!waitqueue_active(&slow_work_thread_wq) && | ||
| 230 | (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && | ||
| 231 | atomic_read(&slow_work_thread_count) < slow_work_max_threads && | ||
| 232 | !slow_work_may_not_start_new_thread) | ||
| 233 | slow_work_enqueue(&slow_work_new_thread); | ||
| 234 | |||
| 235 | /* find something to execute */ | ||
| 236 | spin_lock_irq(&slow_work_queue_lock); | ||
| 237 | if (!list_empty(&vslow_work_queue) && | ||
| 238 | atomic_read(&vslow_work_executing_count) < vsmax) { | ||
| 239 | work = list_entry(vslow_work_queue.next, | ||
| 240 | struct slow_work, link); | ||
| 241 | if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) | ||
| 242 | BUG(); | ||
| 243 | list_del_init(&work->link); | ||
| 244 | atomic_inc(&vslow_work_executing_count); | ||
| 245 | very_slow = true; | ||
| 246 | } else if (!list_empty(&slow_work_queue)) { | ||
| 247 | work = list_entry(slow_work_queue.next, | ||
| 248 | struct slow_work, link); | ||
| 249 | if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) | ||
| 250 | BUG(); | ||
| 251 | list_del_init(&work->link); | ||
| 252 | very_slow = false; | ||
| 253 | } else { | ||
| 254 | very_slow = false; /* avoid the compiler warning */ | ||
| 255 | } | ||
| 256 | |||
| 257 | slow_work_set_thread_processing(id, work); | ||
| 258 | if (work) { | ||
| 259 | slow_work_mark_time(work); | ||
| 260 | slow_work_begin_exec(id, work); | ||
| 261 | } | ||
| 262 | |||
| 263 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 264 | |||
| 265 | if (!work) | ||
| 266 | return false; | ||
| 267 | |||
| 268 | if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) | ||
| 269 | BUG(); | ||
| 270 | |||
| 271 | /* don't execute if the work is in the process of being cancelled */ | ||
| 272 | if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
| 273 | work->ops->execute(work); | ||
| 274 | |||
| 275 | if (very_slow) | ||
| 276 | atomic_dec(&vslow_work_executing_count); | ||
| 277 | clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); | ||
| 278 | |||
| 279 | /* wake up anyone waiting for this work to be complete */ | ||
| 280 | wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); | ||
| 281 | |||
| 282 | slow_work_end_exec(id, work); | ||
| 283 | |||
| 284 | /* if someone tried to enqueue the item whilst we were executing it, | ||
| 285 | * then it'll be left unenqueued to avoid multiple threads trying to | ||
| 286 | * execute it simultaneously | ||
| 287 | * | ||
| 288 | * there is, however, a race between us testing the pending flag and | ||
| 289 | * getting the spinlock, and between the enqueuer setting the pending | ||
| 290 | * flag and getting the spinlock, so we use a deferral bit to tell us | ||
| 291 | * if the enqueuer got there first | ||
| 292 | */ | ||
| 293 | if (test_bit(SLOW_WORK_PENDING, &work->flags)) { | ||
| 294 | spin_lock_irq(&slow_work_queue_lock); | ||
| 295 | |||
| 296 | if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && | ||
| 297 | test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) | ||
| 298 | goto auto_requeue; | ||
| 299 | |||
| 300 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 301 | } | ||
| 302 | |||
| 303 | /* sort out the race between module unloading and put_ref() */ | ||
| 304 | slow_work_put_ref(work); | ||
| 305 | slow_work_done_thread_processing(id, work); | ||
| 306 | |||
| 307 | return true; | ||
| 308 | |||
| 309 | auto_requeue: | ||
| 310 | /* we must complete the enqueue operation | ||
| 311 | * - we transfer our ref on the item back to the appropriate queue | ||
| 312 | * - don't wake another thread up as we're awake already | ||
| 313 | */ | ||
| 314 | slow_work_mark_time(work); | ||
| 315 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) | ||
| 316 | list_add_tail(&work->link, &vslow_work_queue); | ||
| 317 | else | ||
| 318 | list_add_tail(&work->link, &slow_work_queue); | ||
| 319 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 320 | slow_work_clear_thread_processing(id); | ||
| 321 | return true; | ||
| 322 | } | ||
| 323 | |||
| 324 | /** | ||
| 325 | * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work | ||
| 326 | * work: The work item under execution that wants to sleep | ||
| 327 | * _timeout: Scheduler sleep timeout | ||
| 328 | * | ||
| 329 | * Allow a requeueable work item to sleep on a slow-work processor thread until | ||
| 330 | * that thread is needed to do some other work or the sleep is interrupted by | ||
| 331 | * some other event. | ||
| 332 | * | ||
| 333 | * The caller must set up a wake up event before calling this and must have set | ||
| 334 | * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own | ||
| 335 | * condition before calling this function as no test is made here. | ||
| 336 | * | ||
| 337 | * False is returned if there is nothing on the queue; true is returned if the | ||
| 338 | * work item should be requeued | ||
| 339 | */ | ||
| 340 | bool slow_work_sleep_till_thread_needed(struct slow_work *work, | ||
| 341 | signed long *_timeout) | ||
| 342 | { | ||
| 343 | wait_queue_head_t *wfo_wq; | ||
| 344 | struct list_head *queue; | ||
| 345 | |||
| 346 | DEFINE_WAIT(wait); | ||
| 347 | |||
| 348 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
| 349 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
| 350 | queue = &vslow_work_queue; | ||
| 351 | } else { | ||
| 352 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
| 353 | queue = &slow_work_queue; | ||
| 354 | } | ||
| 355 | |||
| 356 | if (!list_empty(queue)) | ||
| 357 | return true; | ||
| 358 | |||
| 359 | add_wait_queue_exclusive(wfo_wq, &wait); | ||
| 360 | if (list_empty(queue)) | ||
| 361 | *_timeout = schedule_timeout(*_timeout); | ||
| 362 | finish_wait(wfo_wq, &wait); | ||
| 363 | |||
| 364 | return !list_empty(queue); | ||
| 365 | } | ||
| 366 | EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); | ||
| 367 | |||
| 368 | /** | ||
| 369 | * slow_work_enqueue - Schedule a slow work item for processing | ||
| 370 | * @work: The work item to queue | ||
| 371 | * | ||
| 372 | * Schedule a slow work item for processing. If the item is already undergoing | ||
| 373 | * execution, this guarantees not to re-enter the execution routine until the | ||
| 374 | * first execution finishes. | ||
| 375 | * | ||
| 376 | * The item is pinned by this function as it retains a reference to it, managed | ||
| 377 | * through the item operations. The item is unpinned once it has been | ||
| 378 | * executed. | ||
| 379 | * | ||
| 380 | * An item may hog the thread that is running it for a relatively large amount | ||
| 381 | * of time, sufficient, for example, to perform several lookup, mkdir, create | ||
| 382 | * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. | ||
| 383 | * | ||
| 384 | * Conversely, if a number of items are awaiting processing, it may take some | ||
| 385 | * time before any given item is given attention. The number of threads in the | ||
| 386 | * pool may be increased to deal with demand, but only up to a limit. | ||
| 387 | * | ||
| 388 | * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in | ||
| 389 | * the very slow queue, from which only a portion of the threads will be | ||
| 390 | * allowed to pick items to execute. This ensures that very slow items won't | ||
| 391 | * overly block ones that are just ordinarily slow. | ||
| 392 | * | ||
| 393 | * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is | ||
| 394 | * attempted queued) | ||
| 395 | */ | ||
| 396 | int slow_work_enqueue(struct slow_work *work) | ||
| 397 | { | ||
| 398 | wait_queue_head_t *wfo_wq; | ||
| 399 | struct list_head *queue; | ||
| 400 | unsigned long flags; | ||
| 401 | int ret; | ||
| 402 | |||
| 403 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
| 404 | return -ECANCELED; | ||
| 405 | |||
| 406 | BUG_ON(slow_work_user_count <= 0); | ||
| 407 | BUG_ON(!work); | ||
| 408 | BUG_ON(!work->ops); | ||
| 409 | |||
| 410 | /* when honouring an enqueue request, we only promise that we will run | ||
| 411 | * the work function in the future; we do not promise to run it once | ||
| 412 | * per enqueue request | ||
| 413 | * | ||
| 414 | * we use the PENDING bit to merge together repeat requests without | ||
| 415 | * having to disable IRQs and take the spinlock, whilst still | ||
| 416 | * maintaining our promise | ||
| 417 | */ | ||
| 418 | if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { | ||
| 419 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
| 420 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
| 421 | queue = &vslow_work_queue; | ||
| 422 | } else { | ||
| 423 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
| 424 | queue = &slow_work_queue; | ||
| 425 | } | ||
| 426 | |||
| 427 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
| 428 | |||
| 429 | if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) | ||
| 430 | goto cancelled; | ||
| 431 | |||
| 432 | /* we promise that we will not attempt to execute the work | ||
| 433 | * function in more than one thread simultaneously | ||
| 434 | * | ||
| 435 | * this, however, leaves us with a problem if we're asked to | ||
| 436 | * enqueue the work whilst someone is executing the work | ||
| 437 | * function as simply queueing the work immediately means that | ||
| 438 | * another thread may try executing it whilst it is already | ||
| 439 | * under execution | ||
| 440 | * | ||
| 441 | * to deal with this, we set the ENQ_DEFERRED bit instead of | ||
| 442 | * enqueueing, and the thread currently executing the work | ||
| 443 | * function will enqueue the work item when the work function | ||
| 444 | * returns and it has cleared the EXECUTING bit | ||
| 445 | */ | ||
| 446 | if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { | ||
| 447 | set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); | ||
| 448 | } else { | ||
| 449 | ret = slow_work_get_ref(work); | ||
| 450 | if (ret < 0) | ||
| 451 | goto failed; | ||
| 452 | slow_work_mark_time(work); | ||
| 453 | list_add_tail(&work->link, queue); | ||
| 454 | wake_up(&slow_work_thread_wq); | ||
| 455 | |||
| 456 | /* if someone who could be requeued is sleeping on a | ||
| 457 | * thread, then ask them to yield their thread */ | ||
| 458 | if (work->link.prev == queue) | ||
| 459 | wake_up(wfo_wq); | ||
| 460 | } | ||
| 461 | |||
| 462 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
| 463 | } | ||
| 464 | return 0; | ||
| 465 | |||
| 466 | cancelled: | ||
| 467 | ret = -ECANCELED; | ||
| 468 | failed: | ||
| 469 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
| 470 | return ret; | ||
| 471 | } | ||
| 472 | EXPORT_SYMBOL(slow_work_enqueue); | ||
| 473 | |||
/*
 * Sleep action handed to wait_on_bit() by slow_work_cancel(): give up the CPU
 * and report success (0).  The @word argument is not used.
 */
static int slow_work_wait(void *word)
{
	schedule();
	return 0;
}
| 479 | |||
| 480 | /** | ||
| 481 | * slow_work_cancel - Cancel a slow work item | ||
| 482 | * @work: The work item to cancel | ||
| 483 | * | ||
| 484 | * This function will cancel a previously enqueued work item. If we cannot | ||
| 485 | * cancel the work item, it is guarenteed to have run when this function | ||
| 486 | * returns. | ||
| 487 | */ | ||
| 488 | void slow_work_cancel(struct slow_work *work) | ||
| 489 | { | ||
| 490 | bool wait = true, put = false; | ||
| 491 | |||
| 492 | set_bit(SLOW_WORK_CANCELLING, &work->flags); | ||
| 493 | smp_mb(); | ||
| 494 | |||
| 495 | /* if the work item is a delayed work item with an active timer, we | ||
| 496 | * need to wait for the timer to finish _before_ getting the spinlock, | ||
| 497 | * lest we deadlock against the timer routine | ||
| 498 | * | ||
| 499 | * the timer routine will leave DELAYED set if it notices the | ||
| 500 | * CANCELLING flag in time | ||
| 501 | */ | ||
| 502 | if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { | ||
| 503 | struct delayed_slow_work *dwork = | ||
| 504 | container_of(work, struct delayed_slow_work, work); | ||
| 505 | del_timer_sync(&dwork->timer); | ||
| 506 | } | ||
| 507 | |||
| 508 | spin_lock_irq(&slow_work_queue_lock); | ||
| 509 | |||
| 510 | if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { | ||
| 511 | /* the timer routine aborted or never happened, so we are left | ||
| 512 | * holding the timer's reference on the item and should just | ||
| 513 | * drop the pending flag and wait for any ongoing execution to | ||
| 514 | * finish */ | ||
| 515 | struct delayed_slow_work *dwork = | ||
| 516 | container_of(work, struct delayed_slow_work, work); | ||
| 517 | |||
| 518 | BUG_ON(timer_pending(&dwork->timer)); | ||
| 519 | BUG_ON(!list_empty(&work->link)); | ||
| 520 | |||
| 521 | clear_bit(SLOW_WORK_DELAYED, &work->flags); | ||
| 522 | put = true; | ||
| 523 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
| 524 | |||
| 525 | } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && | ||
| 526 | !list_empty(&work->link)) { | ||
| 527 | /* the link in the pending queue holds a reference on the item | ||
| 528 | * that we will need to release */ | ||
| 529 | list_del_init(&work->link); | ||
| 530 | wait = false; | ||
| 531 | put = true; | ||
| 532 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
| 533 | |||
| 534 | } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { | ||
| 535 | /* the executor is holding our only reference on the item, so | ||
| 536 | * we merely need to wait for it to finish executing */ | ||
| 537 | clear_bit(SLOW_WORK_PENDING, &work->flags); | ||
| 538 | } | ||
| 539 | |||
| 540 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 541 | |||
| 542 | /* the EXECUTING flag is set by the executor whilst the spinlock is set | ||
| 543 | * and before the item is dequeued - so assuming the above doesn't | ||
| 544 | * actually dequeue it, simply waiting for the EXECUTING flag to be | ||
| 545 | * released here should be sufficient */ | ||
| 546 | if (wait) | ||
| 547 | wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, | ||
| 548 | TASK_UNINTERRUPTIBLE); | ||
| 549 | |||
| 550 | clear_bit(SLOW_WORK_CANCELLING, &work->flags); | ||
| 551 | if (put) | ||
| 552 | slow_work_put_ref(work); | ||
| 553 | } | ||
| 554 | EXPORT_SYMBOL(slow_work_cancel); | ||
| 555 | |||
| 556 | /* | ||
| 557 | * Handle expiry of the delay timer, indicating that a delayed slow work item | ||
| 558 | * should now be queued if not cancelled | ||
| 559 | */ | ||
| 560 | static void delayed_slow_work_timer(unsigned long data) | ||
| 561 | { | ||
| 562 | wait_queue_head_t *wfo_wq; | ||
| 563 | struct list_head *queue; | ||
| 564 | struct slow_work *work = (struct slow_work *) data; | ||
| 565 | unsigned long flags; | ||
| 566 | bool queued = false, put = false, first = false; | ||
| 567 | |||
| 568 | if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { | ||
| 569 | wfo_wq = &vslow_work_queue_waits_for_occupation; | ||
| 570 | queue = &vslow_work_queue; | ||
| 571 | } else { | ||
| 572 | wfo_wq = &slow_work_queue_waits_for_occupation; | ||
| 573 | queue = &slow_work_queue; | ||
| 574 | } | ||
| 575 | |||
| 576 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
| 577 | if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { | ||
| 578 | clear_bit(SLOW_WORK_DELAYED, &work->flags); | ||
| 579 | |||
| 580 | if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { | ||
| 581 | /* we discard the reference the timer was holding in | ||
| 582 | * favour of the one the executor holds */ | ||
| 583 | set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); | ||
| 584 | put = true; | ||
| 585 | } else { | ||
| 586 | slow_work_mark_time(work); | ||
| 587 | list_add_tail(&work->link, queue); | ||
| 588 | queued = true; | ||
| 589 | if (work->link.prev == queue) | ||
| 590 | first = true; | ||
| 591 | } | ||
| 592 | } | ||
| 593 | |||
| 594 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
| 595 | if (put) | ||
| 596 | slow_work_put_ref(work); | ||
| 597 | if (first) | ||
| 598 | wake_up(wfo_wq); | ||
| 599 | if (queued) | ||
| 600 | wake_up(&slow_work_thread_wq); | ||
| 601 | } | ||
| 602 | |||
| 603 | /** | ||
| 604 | * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing | ||
| 605 | * @dwork: The delayed work item to queue | ||
| 606 | * @delay: When to start executing the work, in jiffies from now | ||
| 607 | * | ||
| 608 | * This is similar to slow_work_enqueue(), but it adds a delay before the work | ||
| 609 | * is actually queued for processing. | ||
| 610 | * | ||
| 611 | * The item can have delayed processing requested on it whilst it is being | ||
| 612 | * executed. The delay will begin immediately, and if it expires before the | ||
| 613 | * item finishes executing, the item will be placed back on the queue when it | ||
| 614 | * has done executing. | ||
| 615 | */ | ||
| 616 | int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, | ||
| 617 | unsigned long delay) | ||
| 618 | { | ||
| 619 | struct slow_work *work = &dwork->work; | ||
| 620 | unsigned long flags; | ||
| 621 | int ret; | ||
| 622 | |||
| 623 | if (delay == 0) | ||
| 624 | return slow_work_enqueue(&dwork->work); | ||
| 625 | |||
| 626 | BUG_ON(slow_work_user_count <= 0); | ||
| 627 | BUG_ON(!work); | ||
| 628 | BUG_ON(!work->ops); | ||
| 629 | |||
| 630 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
| 631 | return -ECANCELED; | ||
| 632 | |||
| 633 | if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { | ||
| 634 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
| 635 | |||
| 636 | if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) | ||
| 637 | goto cancelled; | ||
| 638 | |||
| 639 | /* the timer holds a reference whilst it is pending */ | ||
| 640 | ret = slow_work_get_ref(work); | ||
| 641 | if (ret < 0) | ||
| 642 | goto cant_get_ref; | ||
| 643 | |||
| 644 | if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) | ||
| 645 | BUG(); | ||
| 646 | dwork->timer.expires = jiffies + delay; | ||
| 647 | dwork->timer.data = (unsigned long) work; | ||
| 648 | dwork->timer.function = delayed_slow_work_timer; | ||
| 649 | add_timer(&dwork->timer); | ||
| 650 | |||
| 651 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
| 652 | } | ||
| 653 | |||
| 654 | return 0; | ||
| 655 | |||
| 656 | cancelled: | ||
| 657 | ret = -ECANCELED; | ||
| 658 | cant_get_ref: | ||
| 659 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
| 660 | return ret; | ||
| 661 | } | ||
| 662 | EXPORT_SYMBOL(delayed_slow_work_enqueue); | ||
| 663 | |||
| 664 | /* | ||
| 665 | * Schedule a cull of the thread pool at some time in the near future | ||
| 666 | */ | ||
| 667 | static void slow_work_schedule_cull(void) | ||
| 668 | { | ||
| 669 | mod_timer(&slow_work_cull_timer, | ||
| 670 | round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); | ||
| 671 | } | ||
| 672 | |||
| 673 | /* | ||
| 674 | * Worker thread culling algorithm | ||
| 675 | */ | ||
| 676 | static bool slow_work_cull_thread(void) | ||
| 677 | { | ||
| 678 | unsigned long flags; | ||
| 679 | bool do_cull = false; | ||
| 680 | |||
| 681 | spin_lock_irqsave(&slow_work_queue_lock, flags); | ||
| 682 | |||
| 683 | if (slow_work_cull) { | ||
| 684 | slow_work_cull = false; | ||
| 685 | |||
| 686 | if (list_empty(&slow_work_queue) && | ||
| 687 | list_empty(&vslow_work_queue) && | ||
| 688 | atomic_read(&slow_work_thread_count) > | ||
| 689 | slow_work_min_threads) { | ||
| 690 | slow_work_schedule_cull(); | ||
| 691 | do_cull = true; | ||
| 692 | } | ||
| 693 | } | ||
| 694 | |||
| 695 | spin_unlock_irqrestore(&slow_work_queue_lock, flags); | ||
| 696 | return do_cull; | ||
| 697 | } | ||
| 698 | |||
| 699 | /* | ||
| 700 | * Determine if there is slow work available for dispatch | ||
| 701 | */ | ||
| 702 | static inline bool slow_work_available(int vsmax) | ||
| 703 | { | ||
| 704 | return !list_empty(&slow_work_queue) || | ||
| 705 | (!list_empty(&vslow_work_queue) && | ||
| 706 | atomic_read(&vslow_work_executing_count) < vsmax); | ||
| 707 | } | ||
| 708 | |||
| 709 | /* | ||
| 710 | * Worker thread dispatcher | ||
| 711 | */ | ||
| 712 | static int slow_work_thread(void *_data) | ||
| 713 | { | ||
| 714 | int vsmax, id; | ||
| 715 | |||
| 716 | DEFINE_WAIT(wait); | ||
| 717 | |||
| 718 | set_freezable(); | ||
| 719 | set_user_nice(current, -5); | ||
| 720 | |||
| 721 | /* allocate ourselves an ID */ | ||
| 722 | spin_lock_irq(&slow_work_queue_lock); | ||
| 723 | id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); | ||
| 724 | BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); | ||
| 725 | __set_bit(id, slow_work_ids); | ||
| 726 | slow_work_set_thread_pid(id, current->pid); | ||
| 727 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 728 | |||
| 729 | sprintf(current->comm, "kslowd%03u", id); | ||
| 730 | |||
| 731 | for (;;) { | ||
| 732 | vsmax = vslow_work_proportion; | ||
| 733 | vsmax *= atomic_read(&slow_work_thread_count); | ||
| 734 | vsmax /= 100; | ||
| 735 | |||
| 736 | prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, | ||
| 737 | TASK_INTERRUPTIBLE); | ||
| 738 | if (!freezing(current) && | ||
| 739 | !slow_work_threads_should_exit && | ||
| 740 | !slow_work_available(vsmax) && | ||
| 741 | !slow_work_cull) | ||
| 742 | schedule(); | ||
| 743 | finish_wait(&slow_work_thread_wq, &wait); | ||
| 744 | |||
| 745 | try_to_freeze(); | ||
| 746 | |||
| 747 | vsmax = vslow_work_proportion; | ||
| 748 | vsmax *= atomic_read(&slow_work_thread_count); | ||
| 749 | vsmax /= 100; | ||
| 750 | |||
| 751 | if (slow_work_available(vsmax) && slow_work_execute(id)) { | ||
| 752 | cond_resched(); | ||
| 753 | if (list_empty(&slow_work_queue) && | ||
| 754 | list_empty(&vslow_work_queue) && | ||
| 755 | atomic_read(&slow_work_thread_count) > | ||
| 756 | slow_work_min_threads) | ||
| 757 | slow_work_schedule_cull(); | ||
| 758 | continue; | ||
| 759 | } | ||
| 760 | |||
| 761 | if (slow_work_threads_should_exit) | ||
| 762 | break; | ||
| 763 | |||
| 764 | if (slow_work_cull && slow_work_cull_thread()) | ||
| 765 | break; | ||
| 766 | } | ||
| 767 | |||
| 768 | spin_lock_irq(&slow_work_queue_lock); | ||
| 769 | slow_work_set_thread_pid(id, 0); | ||
| 770 | __clear_bit(id, slow_work_ids); | ||
| 771 | spin_unlock_irq(&slow_work_queue_lock); | ||
| 772 | |||
| 773 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
| 774 | complete_and_exit(&slow_work_last_thread_exited, 0); | ||
| 775 | return 0; | ||
| 776 | } | ||
| 777 | |||
/*
 * Handle thread cull timer expiration
 * - flags that a cull is wanted and wakes a waiter on slow_work_thread_wq so
 *   that a worker thread gets to see the flag
 * - @data is not used
 */
static void slow_work_cull_timeout(unsigned long data)
{
	slow_work_cull = true;
	wake_up(&slow_work_thread_wq);
}
| 786 | |||
| 787 | /* | ||
| 788 | * Start a new slow work thread | ||
| 789 | */ | ||
| 790 | static void slow_work_new_thread_execute(struct slow_work *work) | ||
| 791 | { | ||
| 792 | struct task_struct *p; | ||
| 793 | |||
| 794 | if (slow_work_threads_should_exit) | ||
| 795 | return; | ||
| 796 | |||
| 797 | if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) | ||
| 798 | return; | ||
| 799 | |||
| 800 | if (!mutex_trylock(&slow_work_user_lock)) | ||
| 801 | return; | ||
| 802 | |||
| 803 | slow_work_may_not_start_new_thread = true; | ||
| 804 | atomic_inc(&slow_work_thread_count); | ||
| 805 | p = kthread_run(slow_work_thread, NULL, "kslowd"); | ||
| 806 | if (IS_ERR(p)) { | ||
| 807 | printk(KERN_DEBUG "Slow work thread pool: OOM\n"); | ||
| 808 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
| 809 | BUG(); /* we're running on a slow work thread... */ | ||
| 810 | mod_timer(&slow_work_oom_timer, | ||
| 811 | round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); | ||
| 812 | } else { | ||
| 813 | /* ratelimit the starting of new threads */ | ||
| 814 | mod_timer(&slow_work_oom_timer, jiffies + 1); | ||
| 815 | } | ||
| 816 | |||
| 817 | mutex_unlock(&slow_work_user_lock); | ||
| 818 | } | ||
| 819 | |||
/*
 * Operations for the slow_work_new_thread work item, which is executed by
 * slow_work_new_thread_execute(); a description op is only provided when
 * CONFIG_SLOW_WORK_DEBUG is enabled
 */
static const struct slow_work_ops slow_work_new_thread_ops = {
	.owner = THIS_MODULE,
	.execute = slow_work_new_thread_execute,
#ifdef CONFIG_SLOW_WORK_DEBUG
	.desc = slow_work_new_thread_desc,
#endif
};
| 827 | |||
/*
 * post-OOM new thread start suppression expiration
 * - lifts the suppression by clearing slow_work_may_not_start_new_thread
 * - @data is not used
 */
static void slow_work_oom_timeout(unsigned long data)
{
	slow_work_may_not_start_new_thread = false;
}
| 835 | |||
| 836 | #ifdef CONFIG_SYSCTL | ||
| 837 | /* | ||
| 838 | * Handle adjustment of the minimum number of threads | ||
| 839 | */ | ||
| 840 | static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, | ||
| 841 | void __user *buffer, | ||
| 842 | size_t *lenp, loff_t *ppos) | ||
| 843 | { | ||
| 844 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 845 | int n; | ||
| 846 | |||
| 847 | if (ret == 0) { | ||
| 848 | mutex_lock(&slow_work_user_lock); | ||
| 849 | if (slow_work_user_count > 0) { | ||
| 850 | /* see if we need to start or stop threads */ | ||
| 851 | n = atomic_read(&slow_work_thread_count) - | ||
| 852 | slow_work_min_threads; | ||
| 853 | |||
| 854 | if (n < 0 && !slow_work_may_not_start_new_thread) | ||
| 855 | slow_work_enqueue(&slow_work_new_thread); | ||
| 856 | else if (n > 0) | ||
| 857 | slow_work_schedule_cull(); | ||
| 858 | } | ||
| 859 | mutex_unlock(&slow_work_user_lock); | ||
| 860 | } | ||
| 861 | |||
| 862 | return ret; | ||
| 863 | } | ||
| 864 | |||
| 865 | /* | ||
| 866 | * Handle adjustment of the maximum number of threads | ||
| 867 | */ | ||
| 868 | static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, | ||
| 869 | void __user *buffer, | ||
| 870 | size_t *lenp, loff_t *ppos) | ||
| 871 | { | ||
| 872 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 873 | int n; | ||
| 874 | |||
| 875 | if (ret == 0) { | ||
| 876 | mutex_lock(&slow_work_user_lock); | ||
| 877 | if (slow_work_user_count > 0) { | ||
| 878 | /* see if we need to stop threads */ | ||
| 879 | n = slow_work_max_threads - | ||
| 880 | atomic_read(&slow_work_thread_count); | ||
| 881 | |||
| 882 | if (n < 0) | ||
| 883 | slow_work_schedule_cull(); | ||
| 884 | } | ||
| 885 | mutex_unlock(&slow_work_user_lock); | ||
| 886 | } | ||
| 887 | |||
| 888 | return ret; | ||
| 889 | } | ||
| 890 | #endif /* CONFIG_SYSCTL */ | ||
| 891 | |||
| 892 | /** | ||
| 893 | * slow_work_register_user - Register a user of the facility | ||
| 894 | * @module: The module about to make use of the facility | ||
| 895 | * | ||
| 896 | * Register a user of the facility, starting up the initial threads if there | ||
| 897 | * aren't any other users at this point. This will return 0 if successful, or | ||
| 898 | * an error if not. | ||
| 899 | */ | ||
| 900 | int slow_work_register_user(struct module *module) | ||
| 901 | { | ||
| 902 | struct task_struct *p; | ||
| 903 | int loop; | ||
| 904 | |||
| 905 | mutex_lock(&slow_work_user_lock); | ||
| 906 | |||
| 907 | if (slow_work_user_count == 0) { | ||
| 908 | printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); | ||
| 909 | init_completion(&slow_work_last_thread_exited); | ||
| 910 | |||
| 911 | slow_work_threads_should_exit = false; | ||
| 912 | slow_work_init(&slow_work_new_thread, | ||
| 913 | &slow_work_new_thread_ops); | ||
| 914 | slow_work_may_not_start_new_thread = false; | ||
| 915 | slow_work_cull = false; | ||
| 916 | |||
| 917 | /* start the minimum number of threads */ | ||
| 918 | for (loop = 0; loop < slow_work_min_threads; loop++) { | ||
| 919 | atomic_inc(&slow_work_thread_count); | ||
| 920 | p = kthread_run(slow_work_thread, NULL, "kslowd"); | ||
| 921 | if (IS_ERR(p)) | ||
| 922 | goto error; | ||
| 923 | } | ||
| 924 | printk(KERN_NOTICE "Slow work thread pool: Ready\n"); | ||
| 925 | } | ||
| 926 | |||
| 927 | slow_work_user_count++; | ||
| 928 | mutex_unlock(&slow_work_user_lock); | ||
| 929 | return 0; | ||
| 930 | |||
| 931 | error: | ||
| 932 | if (atomic_dec_and_test(&slow_work_thread_count)) | ||
| 933 | complete(&slow_work_last_thread_exited); | ||
| 934 | if (loop > 0) { | ||
| 935 | printk(KERN_ERR "Slow work thread pool:" | ||
| 936 | " Aborting startup on ENOMEM\n"); | ||
| 937 | slow_work_threads_should_exit = true; | ||
| 938 | wake_up_all(&slow_work_thread_wq); | ||
| 939 | wait_for_completion(&slow_work_last_thread_exited); | ||
| 940 | printk(KERN_ERR "Slow work thread pool: Aborted\n"); | ||
| 941 | } | ||
| 942 | mutex_unlock(&slow_work_user_lock); | ||
| 943 | return PTR_ERR(p); | ||
| 944 | } | ||
| 945 | EXPORT_SYMBOL(slow_work_register_user); | ||
| 946 | |||
/*
 * wait for all outstanding items from the calling module to complete
 * - note that more items may be queued whilst we're waiting
 * - compiles to nothing unless CONFIG_MODULES is enabled
 * - waiters are serialised by slow_work_unreg_sync_lock and sleep on
 *   slow_work_unreg_wq
 */
static void slow_work_wait_for_items(struct module *module)
{
#ifdef CONFIG_MODULES
	DECLARE_WAITQUEUE(myself, current);
	struct slow_work *work;
	int loop;

	mutex_lock(&slow_work_unreg_sync_lock);
	add_wait_queue(&slow_work_unreg_wq, &myself);

	for (;;) {
		spin_lock_irq(&slow_work_queue_lock);

		/* first of all, we wait for the last queued item in each list
		 * to be processed (the lists are walked tail-first, so the
		 * first match is the most recently queued item of ours) */
		list_for_each_entry_reverse(work, &vslow_work_queue, link) {
			if (work->owner == module) {
				set_current_state(TASK_UNINTERRUPTIBLE);
				slow_work_unreg_work_item = work;
				goto do_wait;
			}
		}
		list_for_each_entry_reverse(work, &slow_work_queue, link) {
			if (work->owner == module) {
				set_current_state(TASK_UNINTERRUPTIBLE);
				slow_work_unreg_work_item = work;
				goto do_wait;
			}
		}

		/* then we wait for the items being processed to finish */
		slow_work_unreg_module = module;
		smp_mb();
		for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
			/* NOTE(review): unlike the two queue scans above, this
			 * path reaches schedule() without a
			 * set_current_state() call - confirm that the loop is
			 * not expected to spin here */
			if (slow_work_thread_processing[loop] == module)
				goto do_wait;
		}
		spin_unlock_irq(&slow_work_queue_lock);
		break; /* okay, we're done */

	do_wait:
		spin_unlock_irq(&slow_work_queue_lock);
		schedule();
		/* withdraw what we were waiting for before rescanning */
		slow_work_unreg_work_item = NULL;
		slow_work_unreg_module = NULL;
	}

	remove_wait_queue(&slow_work_unreg_wq, &myself);
	mutex_unlock(&slow_work_unreg_sync_lock);
#endif /* CONFIG_MODULES */
}
| 1002 | |||
| 1003 | /** | ||
| 1004 | * slow_work_unregister_user - Unregister a user of the facility | ||
| 1005 | * @module: The module whose items should be cleared | ||
| 1006 | * | ||
| 1007 | * Unregister a user of the facility, killing all the threads if this was the | ||
| 1008 | * last one. | ||
| 1009 | * | ||
| 1010 | * This waits for all the work items belonging to the nominated module to go | ||
| 1011 | * away before proceeding. | ||
| 1012 | */ | ||
| 1013 | void slow_work_unregister_user(struct module *module) | ||
| 1014 | { | ||
| 1015 | /* first of all, wait for all outstanding items from the calling module | ||
| 1016 | * to complete */ | ||
| 1017 | if (module) | ||
| 1018 | slow_work_wait_for_items(module); | ||
| 1019 | |||
| 1020 | /* then we can actually go about shutting down the facility if need | ||
| 1021 | * be */ | ||
| 1022 | mutex_lock(&slow_work_user_lock); | ||
| 1023 | |||
| 1024 | BUG_ON(slow_work_user_count <= 0); | ||
| 1025 | |||
| 1026 | slow_work_user_count--; | ||
| 1027 | if (slow_work_user_count == 0) { | ||
| 1028 | printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); | ||
| 1029 | slow_work_threads_should_exit = true; | ||
| 1030 | del_timer_sync(&slow_work_cull_timer); | ||
| 1031 | del_timer_sync(&slow_work_oom_timer); | ||
| 1032 | wake_up_all(&slow_work_thread_wq); | ||
| 1033 | wait_for_completion(&slow_work_last_thread_exited); | ||
| 1034 | printk(KERN_NOTICE "Slow work thread pool:" | ||
| 1035 | " Shut down complete\n"); | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | mutex_unlock(&slow_work_user_lock); | ||
| 1039 | } | ||
| 1040 | EXPORT_SYMBOL(slow_work_unregister_user); | ||
| 1041 | |||
| 1042 | /* | ||
| 1043 | * Initialise the slow work facility | ||
| 1044 | */ | ||
| 1045 | static int __init init_slow_work(void) | ||
| 1046 | { | ||
| 1047 | unsigned nr_cpus = num_possible_cpus(); | ||
| 1048 | |||
| 1049 | if (slow_work_max_threads < nr_cpus) | ||
| 1050 | slow_work_max_threads = nr_cpus; | ||
| 1051 | #ifdef CONFIG_SYSCTL | ||
| 1052 | if (slow_work_max_max_threads < nr_cpus * 2) | ||
| 1053 | slow_work_max_max_threads = nr_cpus * 2; | ||
| 1054 | #endif | ||
| 1055 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 1056 | { | ||
| 1057 | struct dentry *dbdir; | ||
| 1058 | |||
| 1059 | dbdir = debugfs_create_dir("slow_work", NULL); | ||
| 1060 | if (dbdir && !IS_ERR(dbdir)) | ||
| 1061 | debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, | ||
| 1062 | NULL, &slow_work_runqueue_fops); | ||
| 1063 | } | ||
| 1064 | #endif | ||
| 1065 | return 0; | ||
| 1066 | } | ||
| 1067 | |||
| 1068 | subsys_initcall(init_slow_work); | ||
diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41d..000000000000 --- a/kernel/slow-work.h +++ /dev/null | |||
| @@ -1,72 +0,0 @@ | |||
| 1 | /* Slow work private definitions | ||
| 2 | * | ||
| 3 | * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. | ||
| 4 | * Written by David Howells (dhowells@redhat.com) | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public Licence | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the Licence, or (at your option) any later version. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of | ||
| 13 | * things to do */ | ||
| 14 | #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after | ||
| 15 | * OOM */ | ||
| 16 | |||
| 17 | #define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ | ||
| 18 | |||
| 19 | /* | ||
| 20 | * slow-work.c | ||
| 21 | */ | ||
| 22 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 23 | extern struct slow_work *slow_work_execs[]; | ||
| 24 | extern pid_t slow_work_pids[]; | ||
| 25 | extern rwlock_t slow_work_execs_lock; | ||
| 26 | #endif | ||
| 27 | |||
| 28 | extern struct list_head slow_work_queue; | ||
| 29 | extern struct list_head vslow_work_queue; | ||
| 30 | extern spinlock_t slow_work_queue_lock; | ||
| 31 | |||
| 32 | /* | ||
| 33 | * slow-work-debugfs.c | ||
| 34 | */ | ||
| 35 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 36 | extern const struct file_operations slow_work_runqueue_fops; | ||
| 37 | |||
| 38 | extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); | ||
| 39 | #endif | ||
| 40 | |||
| 41 | /* | ||
| 42 | * Helper functions | ||
| 43 | */ | ||
| 44 | static inline void slow_work_set_thread_pid(int id, pid_t pid) | ||
| 45 | { | ||
| 46 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 47 | slow_work_pids[id] = pid; | ||
| 48 | #endif | ||
| 49 | } | ||
| 50 | |||
| 51 | static inline void slow_work_mark_time(struct slow_work *work) | ||
| 52 | { | ||
| 53 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 54 | work->mark = CURRENT_TIME; | ||
| 55 | #endif | ||
| 56 | } | ||
| 57 | |||
| 58 | static inline void slow_work_begin_exec(int id, struct slow_work *work) | ||
| 59 | { | ||
| 60 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 61 | slow_work_execs[id] = work; | ||
| 62 | #endif | ||
| 63 | } | ||
| 64 | |||
| 65 | static inline void slow_work_end_exec(int id, struct slow_work *work) | ||
| 66 | { | ||
| 67 | #ifdef CONFIG_SLOW_WORK_DEBUG | ||
| 68 | write_lock(&slow_work_execs_lock); | ||
| 69 | slow_work_execs[id] = NULL; | ||
| 70 | write_unlock(&slow_work_execs_lock); | ||
| 71 | #endif | ||
| 72 | } | ||
diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f67dcb5..000000000000 --- a/kernel/softlockup.c +++ /dev/null | |||
| @@ -1,293 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Detect Soft Lockups | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. | ||
| 5 | * | ||
| 6 | * this code detects soft lockups: incidents in where on a CPU | ||
| 7 | * the kernel does not reschedule for 10 seconds or more. | ||
| 8 | */ | ||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/cpu.h> | ||
| 11 | #include <linux/nmi.h> | ||
| 12 | #include <linux/init.h> | ||
| 13 | #include <linux/delay.h> | ||
| 14 | #include <linux/freezer.h> | ||
| 15 | #include <linux/kthread.h> | ||
| 16 | #include <linux/lockdep.h> | ||
| 17 | #include <linux/notifier.h> | ||
| 18 | #include <linux/module.h> | ||
| 19 | #include <linux/sysctl.h> | ||
| 20 | |||
| 21 | #include <asm/irq_regs.h> | ||
| 22 | |||
| 23 | static DEFINE_SPINLOCK(print_lock); | ||
| 24 | |||
| 25 | static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ | ||
| 26 | static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ | ||
| 27 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
| 28 | static DEFINE_PER_CPU(bool, softlock_touch_sync); | ||
| 29 | |||
| 30 | static int __read_mostly did_panic; | ||
| 31 | int __read_mostly softlockup_thresh = 60; | ||
| 32 | |||
| 33 | /* | ||
| 34 | * Should we panic (and reboot, if panic_timeout= is set) when a | ||
| 35 | * soft-lockup occurs: | ||
| 36 | */ | ||
| 37 | unsigned int __read_mostly softlockup_panic = | ||
| 38 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
| 39 | |||
| 40 | static int __init softlockup_panic_setup(char *str) | ||
| 41 | { | ||
| 42 | softlockup_panic = simple_strtoul(str, NULL, 0); | ||
| 43 | |||
| 44 | return 1; | ||
| 45 | } | ||
| 46 | __setup("softlockup_panic=", softlockup_panic_setup); | ||
| 47 | |||
| 48 | static int | ||
| 49 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
| 50 | { | ||
| 51 | did_panic = 1; | ||
| 52 | |||
| 53 | return NOTIFY_DONE; | ||
| 54 | } | ||
| 55 | |||
| 56 | static struct notifier_block panic_block = { | ||
| 57 | .notifier_call = softlock_panic, | ||
| 58 | }; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Returns seconds, approximately. We don't need nanosecond | ||
| 62 | * resolution, and we don't need to waste time with a big divide when | ||
| 63 | * 2^30ns == 1.074s. | ||
| 64 | */ | ||
| 65 | static unsigned long get_timestamp(int this_cpu) | ||
| 66 | { | ||
| 67 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | ||
| 68 | } | ||
| 69 | |||
| 70 | static void __touch_softlockup_watchdog(void) | ||
| 71 | { | ||
| 72 | int this_cpu = raw_smp_processor_id(); | ||
| 73 | |||
| 74 | __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); | ||
| 75 | } | ||
| 76 | |||
| 77 | void touch_softlockup_watchdog(void) | ||
| 78 | { | ||
| 79 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
| 80 | } | ||
| 81 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
| 82 | |||
| 83 | void touch_softlockup_watchdog_sync(void) | ||
| 84 | { | ||
| 85 | __raw_get_cpu_var(softlock_touch_sync) = true; | ||
| 86 | __raw_get_cpu_var(softlockup_touch_ts) = 0; | ||
| 87 | } | ||
| 88 | |||
| 89 | void touch_all_softlockup_watchdogs(void) | ||
| 90 | { | ||
| 91 | int cpu; | ||
| 92 | |||
| 93 | /* Cause each CPU to re-update its timestamp rather than complain */ | ||
| 94 | for_each_online_cpu(cpu) | ||
| 95 | per_cpu(softlockup_touch_ts, cpu) = 0; | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL(touch_all_softlockup_watchdogs); | ||
| 98 | |||
| 99 | int proc_dosoftlockup_thresh(struct ctl_table *table, int write, | ||
| 100 | void __user *buffer, | ||
| 101 | size_t *lenp, loff_t *ppos) | ||
| 102 | { | ||
| 103 | touch_all_softlockup_watchdogs(); | ||
| 104 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 105 | } | ||
| 106 | |||
| 107 | /* | ||
| 108 | * This callback runs from the timer interrupt, and checks | ||
| 109 | * whether the watchdog thread has hung or not: | ||
| 110 | */ | ||
| 111 | void softlockup_tick(void) | ||
| 112 | { | ||
| 113 | int this_cpu = smp_processor_id(); | ||
| 114 | unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); | ||
| 115 | unsigned long print_ts; | ||
| 116 | struct pt_regs *regs = get_irq_regs(); | ||
| 117 | unsigned long now; | ||
| 118 | |||
| 119 | /* Is detection switched off? */ | ||
| 120 | if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { | ||
| 121 | /* Be sure we don't false trigger if switched back on */ | ||
| 122 | if (touch_ts) | ||
| 123 | per_cpu(softlockup_touch_ts, this_cpu) = 0; | ||
| 124 | return; | ||
| 125 | } | ||
| 126 | |||
| 127 | if (touch_ts == 0) { | ||
| 128 | if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { | ||
| 129 | /* | ||
| 130 | * If the time stamp was touched atomically | ||
| 131 | * make sure the scheduler tick is up to date. | ||
| 132 | */ | ||
| 133 | per_cpu(softlock_touch_sync, this_cpu) = false; | ||
| 134 | sched_clock_tick(); | ||
| 135 | } | ||
| 136 | __touch_softlockup_watchdog(); | ||
| 137 | return; | ||
| 138 | } | ||
| 139 | |||
| 140 | print_ts = per_cpu(softlockup_print_ts, this_cpu); | ||
| 141 | |||
| 142 | /* report at most once a second */ | ||
| 143 | if (print_ts == touch_ts || did_panic) | ||
| 144 | return; | ||
| 145 | |||
| 146 | /* do not print during early bootup: */ | ||
| 147 | if (unlikely(system_state != SYSTEM_RUNNING)) { | ||
| 148 | __touch_softlockup_watchdog(); | ||
| 149 | return; | ||
| 150 | } | ||
| 151 | |||
| 152 | now = get_timestamp(this_cpu); | ||
| 153 | |||
| 154 | /* | ||
| 155 | * Wake up the high-prio watchdog task twice per | ||
| 156 | * threshold timespan. | ||
| 157 | */ | ||
| 158 | if (time_after(now - softlockup_thresh/2, touch_ts)) | ||
| 159 | wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); | ||
| 160 | |||
| 161 | /* Warn about unreasonable delays: */ | ||
| 162 | if (time_before_eq(now - softlockup_thresh, touch_ts)) | ||
| 163 | return; | ||
| 164 | |||
| 165 | per_cpu(softlockup_print_ts, this_cpu) = touch_ts; | ||
| 166 | |||
| 167 | spin_lock(&print_lock); | ||
| 168 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", | ||
| 169 | this_cpu, now - touch_ts, | ||
| 170 | current->comm, task_pid_nr(current)); | ||
| 171 | print_modules(); | ||
| 172 | print_irqtrace_events(current); | ||
| 173 | if (regs) | ||
| 174 | show_regs(regs); | ||
| 175 | else | ||
| 176 | dump_stack(); | ||
| 177 | spin_unlock(&print_lock); | ||
| 178 | |||
| 179 | if (softlockup_panic) | ||
| 180 | panic("softlockup: hung tasks"); | ||
| 181 | } | ||
| 182 | |||
| 183 | /* | ||
| 184 | * The watchdog thread - runs every second and touches the timestamp. | ||
| 185 | */ | ||
| 186 | static int watchdog(void *__bind_cpu) | ||
| 187 | { | ||
| 188 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
| 189 | |||
| 190 | sched_setscheduler(current, SCHED_FIFO, &param); | ||
| 191 | |||
| 192 | /* initialize timestamp */ | ||
| 193 | __touch_softlockup_watchdog(); | ||
| 194 | |||
| 195 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 196 | /* | ||
| 197 | * Run briefly once per second to reset the softlockup timestamp. | ||
| 198 | * If this gets delayed for more than 60 seconds then the | ||
| 199 | * debug-printout triggers in softlockup_tick(). | ||
| 200 | */ | ||
| 201 | while (!kthread_should_stop()) { | ||
| 202 | __touch_softlockup_watchdog(); | ||
| 203 | schedule(); | ||
| 204 | |||
| 205 | if (kthread_should_stop()) | ||
| 206 | break; | ||
| 207 | |||
| 208 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 209 | } | ||
| 210 | __set_current_state(TASK_RUNNING); | ||
| 211 | |||
| 212 | return 0; | ||
| 213 | } | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Create/destroy watchdog threads as CPUs come and go: | ||
| 217 | */ | ||
| 218 | static int __cpuinit | ||
| 219 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
| 220 | { | ||
| 221 | int hotcpu = (unsigned long)hcpu; | ||
| 222 | struct task_struct *p; | ||
| 223 | |||
| 224 | switch (action) { | ||
| 225 | case CPU_UP_PREPARE: | ||
| 226 | case CPU_UP_PREPARE_FROZEN: | ||
| 227 | BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); | ||
| 228 | p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); | ||
| 229 | if (IS_ERR(p)) { | ||
| 230 | printk(KERN_ERR "watchdog for %i failed\n", hotcpu); | ||
| 231 | return NOTIFY_BAD; | ||
| 232 | } | ||
| 233 | per_cpu(softlockup_touch_ts, hotcpu) = 0; | ||
| 234 | per_cpu(softlockup_watchdog, hotcpu) = p; | ||
| 235 | kthread_bind(p, hotcpu); | ||
| 236 | break; | ||
| 237 | case CPU_ONLINE: | ||
| 238 | case CPU_ONLINE_FROZEN: | ||
| 239 | wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); | ||
| 240 | break; | ||
| 241 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 242 | case CPU_UP_CANCELED: | ||
| 243 | case CPU_UP_CANCELED_FROZEN: | ||
| 244 | if (!per_cpu(softlockup_watchdog, hotcpu)) | ||
| 245 | break; | ||
| 246 | /* Unbind so it can run. Fall thru. */ | ||
| 247 | kthread_bind(per_cpu(softlockup_watchdog, hotcpu), | ||
| 248 | cpumask_any(cpu_online_mask)); | ||
| 249 | case CPU_DEAD: | ||
| 250 | case CPU_DEAD_FROZEN: | ||
| 251 | p = per_cpu(softlockup_watchdog, hotcpu); | ||
| 252 | per_cpu(softlockup_watchdog, hotcpu) = NULL; | ||
| 253 | kthread_stop(p); | ||
| 254 | break; | ||
| 255 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 256 | } | ||
| 257 | return NOTIFY_OK; | ||
| 258 | } | ||
| 259 | |||
| 260 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
| 261 | .notifier_call = cpu_callback | ||
| 262 | }; | ||
| 263 | |||
| 264 | static int __initdata nosoftlockup; | ||
| 265 | |||
| 266 | static int __init nosoftlockup_setup(char *str) | ||
| 267 | { | ||
| 268 | nosoftlockup = 1; | ||
| 269 | return 1; | ||
| 270 | } | ||
| 271 | __setup("nosoftlockup", nosoftlockup_setup); | ||
| 272 | |||
| 273 | static int __init spawn_softlockup_task(void) | ||
| 274 | { | ||
| 275 | void *cpu = (void *)(long)smp_processor_id(); | ||
| 276 | int err; | ||
| 277 | |||
| 278 | if (nosoftlockup) | ||
| 279 | return 0; | ||
| 280 | |||
| 281 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
| 282 | if (err == NOTIFY_BAD) { | ||
| 283 | BUG(); | ||
| 284 | return 1; | ||
| 285 | } | ||
| 286 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
| 287 | register_cpu_notifier(&cpu_nfb); | ||
| 288 | |||
| 289 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
| 290 | |||
| 291 | return 0; | ||
| 292 | } | ||
| 293 | early_initcall(spawn_softlockup_task); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761f4876..6d850bf0a517 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -50,7 +50,6 @@ | |||
| 50 | #include <linux/acpi.h> | 50 | #include <linux/acpi.h> |
| 51 | #include <linux/reboot.h> | 51 | #include <linux/reboot.h> |
| 52 | #include <linux/ftrace.h> | 52 | #include <linux/ftrace.h> |
| 53 | #include <linux/slow-work.h> | ||
| 54 | #include <linux/perf_event.h> | 53 | #include <linux/perf_event.h> |
| 55 | #include <linux/kprobes.h> | 54 | #include <linux/kprobes.h> |
| 56 | #include <linux/pipe_fs_i.h> | 55 | #include <linux/pipe_fs_i.h> |
| @@ -76,6 +75,10 @@ | |||
| 76 | #include <scsi/sg.h> | 75 | #include <scsi/sg.h> |
| 77 | #endif | 76 | #endif |
| 78 | 77 | ||
| 78 | #ifdef CONFIG_LOCKUP_DETECTOR | ||
| 79 | #include <linux/nmi.h> | ||
| 80 | #endif | ||
| 81 | |||
| 79 | 82 | ||
| 80 | #if defined(CONFIG_SYSCTL) | 83 | #if defined(CONFIG_SYSCTL) |
| 81 | 84 | ||
| @@ -106,7 +109,7 @@ extern int blk_iopoll_enabled; | |||
| 106 | #endif | 109 | #endif |
| 107 | 110 | ||
| 108 | /* Constants used for minimum and maximum */ | 111 | /* Constants used for minimum and maximum */ |
| 109 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 112 | #ifdef CONFIG_LOCKUP_DETECTOR |
| 110 | static int sixty = 60; | 113 | static int sixty = 60; |
| 111 | static int neg_one = -1; | 114 | static int neg_one = -1; |
| 112 | #endif | 115 | #endif |
| @@ -562,7 +565,7 @@ static struct ctl_table kern_table[] = { | |||
| 562 | .extra2 = &one, | 565 | .extra2 = &one, |
| 563 | }, | 566 | }, |
| 564 | #endif | 567 | #endif |
| 565 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 568 | #ifdef CONFIG_HOTPLUG |
| 566 | { | 569 | { |
| 567 | .procname = "hotplug", | 570 | .procname = "hotplug", |
| 568 | .data = &uevent_helper, | 571 | .data = &uevent_helper, |
| @@ -710,7 +713,34 @@ static struct ctl_table kern_table[] = { | |||
| 710 | .mode = 0444, | 713 | .mode = 0444, |
| 711 | .proc_handler = proc_dointvec, | 714 | .proc_handler = proc_dointvec, |
| 712 | }, | 715 | }, |
| 713 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 716 | #if defined(CONFIG_LOCKUP_DETECTOR) |
| 717 | { | ||
| 718 | .procname = "watchdog", | ||
| 719 | .data = &watchdog_enabled, | ||
| 720 | .maxlen = sizeof (int), | ||
| 721 | .mode = 0644, | ||
| 722 | .proc_handler = proc_dowatchdog_enabled, | ||
| 723 | }, | ||
| 724 | { | ||
| 725 | .procname = "watchdog_thresh", | ||
| 726 | .data = &softlockup_thresh, | ||
| 727 | .maxlen = sizeof(int), | ||
| 728 | .mode = 0644, | ||
| 729 | .proc_handler = proc_dowatchdog_thresh, | ||
| 730 | .extra1 = &neg_one, | ||
| 731 | .extra2 = &sixty, | ||
| 732 | }, | ||
| 733 | { | ||
| 734 | .procname = "softlockup_panic", | ||
| 735 | .data = &softlockup_panic, | ||
| 736 | .maxlen = sizeof(int), | ||
| 737 | .mode = 0644, | ||
| 738 | .proc_handler = proc_dointvec_minmax, | ||
| 739 | .extra1 = &zero, | ||
| 740 | .extra2 = &one, | ||
| 741 | }, | ||
| 742 | #endif | ||
| 743 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) | ||
| 714 | { | 744 | { |
| 715 | .procname = "unknown_nmi_panic", | 745 | .procname = "unknown_nmi_panic", |
| 716 | .data = &unknown_nmi_panic, | 746 | .data = &unknown_nmi_panic, |
| @@ -813,26 +843,6 @@ static struct ctl_table kern_table[] = { | |||
| 813 | .proc_handler = proc_dointvec, | 843 | .proc_handler = proc_dointvec, |
| 814 | }, | 844 | }, |
| 815 | #endif | 845 | #endif |
| 816 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
| 817 | { | ||
| 818 | .procname = "softlockup_panic", | ||
| 819 | .data = &softlockup_panic, | ||
| 820 | .maxlen = sizeof(int), | ||
| 821 | .mode = 0644, | ||
| 822 | .proc_handler = proc_dointvec_minmax, | ||
| 823 | .extra1 = &zero, | ||
| 824 | .extra2 = &one, | ||
| 825 | }, | ||
| 826 | { | ||
| 827 | .procname = "softlockup_thresh", | ||
| 828 | .data = &softlockup_thresh, | ||
| 829 | .maxlen = sizeof(int), | ||
| 830 | .mode = 0644, | ||
| 831 | .proc_handler = proc_dosoftlockup_thresh, | ||
| 832 | .extra1 = &neg_one, | ||
| 833 | .extra2 = &sixty, | ||
| 834 | }, | ||
| 835 | #endif | ||
| 836 | #ifdef CONFIG_DETECT_HUNG_TASK | 846 | #ifdef CONFIG_DETECT_HUNG_TASK |
| 837 | { | 847 | { |
| 838 | .procname = "hung_task_panic", | 848 | .procname = "hung_task_panic", |
| @@ -906,13 +916,6 @@ static struct ctl_table kern_table[] = { | |||
| 906 | .proc_handler = proc_dointvec, | 916 | .proc_handler = proc_dointvec, |
| 907 | }, | 917 | }, |
| 908 | #endif | 918 | #endif |
| 909 | #ifdef CONFIG_SLOW_WORK | ||
| 910 | { | ||
| 911 | .procname = "slow-work", | ||
| 912 | .mode = 0555, | ||
| 913 | .child = slow_work_sysctls, | ||
| 914 | }, | ||
| 915 | #endif | ||
| 916 | #ifdef CONFIG_PERF_EVENTS | 919 | #ifdef CONFIG_PERF_EVENTS |
| 917 | { | 920 | { |
| 918 | .procname = "perf_event_paranoid", | 921 | .procname = "perf_event_paranoid", |
diff --git a/kernel/time.c b/kernel/time.c index 848b1c2ab09a..ba9b338d1835 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) | |||
| 300 | } | 300 | } |
| 301 | EXPORT_SYMBOL(timespec_trunc); | 301 | EXPORT_SYMBOL(timespec_trunc); |
| 302 | 302 | ||
| 303 | #ifndef CONFIG_GENERIC_TIME | ||
| 304 | /* | ||
| 305 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | ||
| 306 | * and therefore only yields usec accuracy | ||
| 307 | */ | ||
| 308 | void getnstimeofday(struct timespec *tv) | ||
| 309 | { | ||
| 310 | struct timeval x; | ||
| 311 | |||
| 312 | do_gettimeofday(&x); | ||
| 313 | tv->tv_sec = x.tv_sec; | ||
| 314 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | ||
| 315 | } | ||
| 316 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
| 317 | #endif | ||
| 318 | |||
| 319 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 303 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
| 320 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 304 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
| 321 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | 305 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0a..f06a8a365648 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -6,7 +6,7 @@ config TICK_ONESHOT | |||
| 6 | 6 | ||
| 7 | config NO_HZ | 7 | config NO_HZ |
| 8 | bool "Tickless System (Dynamic Ticks)" | 8 | bool "Tickless System (Dynamic Ticks)" |
| 9 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | 9 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
| 10 | select TICK_ONESHOT | 10 | select TICK_ONESHOT |
| 11 | help | 11 | help |
| 12 | This option enables a tickless system: timer interrupts will | 12 | This option enables a tickless system: timer interrupts will |
| @@ -15,7 +15,7 @@ config NO_HZ | |||
| 15 | 15 | ||
| 16 | config HIGH_RES_TIMERS | 16 | config HIGH_RES_TIMERS |
| 17 | bool "High Resolution Timer Support" | 17 | bool "High Resolution Timer Support" |
| 18 | depends on GENERIC_TIME && GENERIC_CLOCKEVENTS | 18 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
| 19 | select TICK_ONESHOT | 19 | select TICK_ONESHOT |
| 20 | help | 20 | help |
| 21 | This option enables high resolution timer support. If your | 21 | This option enables high resolution timer support. If your |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c1d561..c18d7efa1b4b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
| 531 | return max_nsecs - (max_nsecs >> 5); | 531 | return max_nsecs - (max_nsecs >> 5); |
| 532 | } | 532 | } |
| 533 | 533 | ||
| 534 | #ifdef CONFIG_GENERIC_TIME | 534 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
| 535 | 535 | ||
| 536 | /** | 536 | /** |
| 537 | * clocksource_select - Select the best clocksource available | 537 | * clocksource_select - Select the best clocksource available |
| @@ -577,7 +577,7 @@ static void clocksource_select(void) | |||
| 577 | } | 577 | } |
| 578 | } | 578 | } |
| 579 | 579 | ||
| 580 | #else /* CONFIG_GENERIC_TIME */ | 580 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ |
| 581 | 581 | ||
| 582 | static inline void clocksource_select(void) { } | 582 | static inline void clocksource_select(void) { } |
| 583 | 583 | ||
| @@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | 639 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ |
| 640 | 640 | ||
| 641 | /** | 641 | /** |
| 642 | * __clocksource_register_scale - Used to install new clocksources | 642 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
| 643 | * @t: clocksource to be registered | 643 | * @cs: clocksource to be updated |
| 644 | * @scale: Scale factor multiplied against freq to get clocksource hz | 644 | * @scale: Scale factor multiplied against freq to get clocksource hz |
| 645 | * @freq: clocksource frequency (cycles per second) divided by scale | 645 | * @freq: clocksource frequency (cycles per second) divided by scale |
| 646 | * | 646 | * |
| 647 | * Returns -EBUSY if registration fails, zero otherwise. | 647 | * This should only be called from the clocksource->enable() method. |
| 648 | * | 648 | * |
| 649 | * This *SHOULD NOT* be called directly! Please use the | 649 | * This *SHOULD NOT* be called directly! Please use the |
| 650 | * clocksource_register_hz() or clocksource_register_khz helper functions. | 650 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. |
| 651 | */ | 651 | */ |
| 652 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | 652 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
| 653 | { | 653 | { |
| 654 | |||
| 655 | /* | 654 | /* |
| 656 | * Ideally we want to use some of the limits used in | 655 | * Ideally we want to use some of the limits used in |
| 657 | * clocksource_max_deferment, to provide a more informed | 656 | * clocksource_max_deferment, to provide a more informed |
| @@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 662 | NSEC_PER_SEC/scale, | 661 | NSEC_PER_SEC/scale, |
| 663 | MAX_UPDATE_LENGTH*scale); | 662 | MAX_UPDATE_LENGTH*scale); |
| 664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 663 | cs->max_idle_ns = clocksource_max_deferment(cs); |
| 664 | } | ||
| 665 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | ||
| 666 | |||
| 667 | /** | ||
| 668 | * __clocksource_register_scale - Used to install new clocksources | ||
| 669 | * @cs: clocksource to be registered | ||
| 670 | * @scale: Scale factor multiplied against freq to get clocksource hz | ||
| 671 | * @freq: clocksource frequency (cycles per second) divided by scale | ||
| 672 | * | ||
| 673 | * Returns -EBUSY if registration fails, zero otherwise. | ||
| 674 | * | ||
| 675 | * This *SHOULD NOT* be called directly! Please use the | ||
| 676 | * clocksource_register_hz() or clocksource_register_khz helper functions. | ||
| 677 | */ | ||
| 678 | int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | ||
| 679 | { | ||
| 680 | |||
| 681 | /* Initialize mult/shift and max_idle_ns */ | ||
| 682 | __clocksource_updatefreq_scale(cs, scale, freq); | ||
| 665 | 683 | ||
| 684 | /* Add clocksource to the clocksource list */ | ||
| 666 | mutex_lock(&clocksource_mutex); | 685 | mutex_lock(&clocksource_mutex); |
| 667 | clocksource_enqueue(cs); | 686 | clocksource_enqueue(cs); |
| 668 | clocksource_select(); | 687 | clocksource_select(); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb61..3e216e01bbd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 325 | } while (read_seqretry(&xtime_lock, seq)); | 325 | } while (read_seqretry(&xtime_lock, seq)); |
| 326 | 326 | ||
| 327 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || | 327 | if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || |
| 328 | arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { | 328 | arch_needs_cpu(cpu)) { |
| 329 | next_jiffies = last_jiffies + 1; | 329 | next_jiffies = last_jiffies + 1; |
| 330 | delta_jiffies = 1; | 330 | delta_jiffies = 1; |
| 331 | } else { | 331 | } else { |
| @@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 405 | * the scheduler tick in nohz_restart_sched_tick. | 405 | * the scheduler tick in nohz_restart_sched_tick. |
| 406 | */ | 406 | */ |
| 407 | if (!ts->tick_stopped) { | 407 | if (!ts->tick_stopped) { |
| 408 | if (select_nohz_load_balancer(1)) { | 408 | select_nohz_load_balancer(1); |
| 409 | /* | ||
| 410 | * sched tick not stopped! | ||
| 411 | */ | ||
| 412 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
| 413 | goto out; | ||
| 414 | } | ||
| 415 | 409 | ||
| 416 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 410 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
| 417 | ts->tick_stopped = 1; | 411 | ts->tick_stopped = 1; |
| @@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) | |||
| 780 | { | 774 | { |
| 781 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 775 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
| 782 | ktime_t now = ktime_get(); | 776 | ktime_t now = ktime_get(); |
| 783 | u64 offset; | ||
| 784 | 777 | ||
| 785 | /* | 778 | /* |
| 786 | * Emulate tick processing via per-CPU hrtimers: | 779 | * Emulate tick processing via per-CPU hrtimers: |
| @@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) | |||
| 790 | 783 | ||
| 791 | /* Get the next period (per cpu) */ | 784 | /* Get the next period (per cpu) */ |
| 792 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 785 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
| 793 | offset = ktime_to_ns(tick_period) >> 1; | ||
| 794 | do_div(offset, num_possible_cpus()); | ||
| 795 | offset *= smp_processor_id(); | ||
| 796 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
| 797 | 786 | ||
| 798 | for (;;) { | 787 | for (;;) { |
| 799 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 788 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c8..e14c839e9faa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); | |||
| 153 | * - wall_to_monotonic is no longer the boot time, getboottime must be | 153 | * - wall_to_monotonic is no longer the boot time, getboottime must be |
| 154 | * used instead. | 154 | * used instead. |
| 155 | */ | 155 | */ |
| 156 | struct timespec xtime __attribute__ ((aligned (16))); | 156 | static struct timespec xtime __attribute__ ((aligned (16))); |
| 157 | struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | 157 | static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); |
| 158 | static struct timespec total_sleep_time; | 158 | static struct timespec total_sleep_time; |
| 159 | 159 | ||
| 160 | /* | 160 | /* |
| @@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) | |||
| 170 | { | 170 | { |
| 171 | xtime.tv_sec += leapsecond; | 171 | xtime.tv_sec += leapsecond; |
| 172 | wall_to_monotonic.tv_sec -= leapsecond; | 172 | wall_to_monotonic.tv_sec -= leapsecond; |
| 173 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 173 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
| 174 | timekeeper.mult); | ||
| 174 | } | 175 | } |
| 175 | 176 | ||
| 176 | #ifdef CONFIG_GENERIC_TIME | ||
| 177 | |||
| 178 | /** | 177 | /** |
| 179 | * timekeeping_forward_now - update clock to the current time | 178 | * timekeeping_forward_now - update clock to the current time |
| 180 | * | 179 | * |
| @@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) | |||
| 328 | timekeeper.ntp_error = 0; | 327 | timekeeper.ntp_error = 0; |
| 329 | ntp_clear(); | 328 | ntp_clear(); |
| 330 | 329 | ||
| 331 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 330 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
| 331 | timekeeper.mult); | ||
| 332 | 332 | ||
| 333 | write_sequnlock_irqrestore(&xtime_lock, flags); | 333 | write_sequnlock_irqrestore(&xtime_lock, flags); |
| 334 | 334 | ||
| @@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) | |||
| 376 | tick_clock_notify(); | 376 | tick_clock_notify(); |
| 377 | } | 377 | } |
| 378 | 378 | ||
| 379 | #else /* GENERIC_TIME */ | ||
| 380 | |||
| 381 | static inline void timekeeping_forward_now(void) { } | ||
| 382 | |||
| 383 | /** | ||
| 384 | * ktime_get - get the monotonic time in ktime_t format | ||
| 385 | * | ||
| 386 | * returns the time in ktime_t format | ||
| 387 | */ | ||
| 388 | ktime_t ktime_get(void) | ||
| 389 | { | ||
| 390 | struct timespec now; | ||
| 391 | |||
| 392 | ktime_get_ts(&now); | ||
| 393 | |||
| 394 | return timespec_to_ktime(now); | ||
| 395 | } | ||
| 396 | EXPORT_SYMBOL_GPL(ktime_get); | ||
| 397 | |||
| 398 | /** | ||
| 399 | * ktime_get_ts - get the monotonic clock in timespec format | ||
| 400 | * @ts: pointer to timespec variable | ||
| 401 | * | ||
| 402 | * The function calculates the monotonic clock from the realtime | ||
| 403 | * clock and the wall_to_monotonic offset and stores the result | ||
| 404 | * in normalized timespec format in the variable pointed to by @ts. | ||
| 405 | */ | ||
| 406 | void ktime_get_ts(struct timespec *ts) | ||
| 407 | { | ||
| 408 | struct timespec tomono; | ||
| 409 | unsigned long seq; | ||
| 410 | |||
| 411 | do { | ||
| 412 | seq = read_seqbegin(&xtime_lock); | ||
| 413 | getnstimeofday(ts); | ||
| 414 | tomono = wall_to_monotonic; | ||
| 415 | |||
| 416 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 417 | |||
| 418 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | ||
| 419 | ts->tv_nsec + tomono.tv_nsec); | ||
| 420 | } | ||
| 421 | EXPORT_SYMBOL_GPL(ktime_get_ts); | ||
| 422 | |||
| 423 | #endif /* !GENERIC_TIME */ | ||
| 424 | |||
| 425 | /** | 379 | /** |
| 426 | * ktime_get_real - get the real (wall-) time in ktime_t format | 380 | * ktime_get_real - get the real (wall-) time in ktime_t format |
| 427 | * | 381 | * |
| @@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) | |||
| 579 | 533 | ||
| 580 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 534 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
| 581 | ts = timespec_sub(ts, timekeeping_suspend_time); | 535 | ts = timespec_sub(ts, timekeeping_suspend_time); |
| 582 | xtime = timespec_add_safe(xtime, ts); | 536 | xtime = timespec_add(xtime, ts); |
| 583 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | 537 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); |
| 584 | total_sleep_time = timespec_add_safe(total_sleep_time, ts); | 538 | total_sleep_time = timespec_add(total_sleep_time, ts); |
| 585 | } | 539 | } |
| 586 | /* re-base the last cycle value */ | 540 | /* re-base the last cycle value */ |
| 587 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 541 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
| @@ -784,10 +738,11 @@ void update_wall_time(void) | |||
| 784 | return; | 738 | return; |
| 785 | 739 | ||
| 786 | clock = timekeeper.clock; | 740 | clock = timekeeper.clock; |
| 787 | #ifdef CONFIG_GENERIC_TIME | 741 | |
| 788 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 742 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
| 789 | #else | ||
| 790 | offset = timekeeper.cycle_interval; | 743 | offset = timekeeper.cycle_interval; |
| 744 | #else | ||
| 745 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | ||
| 791 | #endif | 746 | #endif |
| 792 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; | 747 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; |
| 793 | 748 | ||
| @@ -856,7 +811,8 @@ void update_wall_time(void) | |||
| 856 | } | 811 | } |
| 857 | 812 | ||
| 858 | /* check to see if there is a new clocksource to use */ | 813 | /* check to see if there is a new clocksource to use */ |
| 859 | update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); | 814 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, |
| 815 | timekeeper.mult); | ||
| 860 | } | 816 | } |
| 861 | 817 | ||
| 862 | /** | 818 | /** |
| @@ -887,7 +843,7 @@ EXPORT_SYMBOL_GPL(getboottime); | |||
| 887 | */ | 843 | */ |
| 888 | void monotonic_to_bootbased(struct timespec *ts) | 844 | void monotonic_to_bootbased(struct timespec *ts) |
| 889 | { | 845 | { |
| 890 | *ts = timespec_add_safe(*ts, total_sleep_time); | 846 | *ts = timespec_add(*ts, total_sleep_time); |
| 891 | } | 847 | } |
| 892 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 848 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); |
| 893 | 849 | ||
| @@ -902,6 +858,11 @@ struct timespec __current_kernel_time(void) | |||
| 902 | return xtime; | 858 | return xtime; |
| 903 | } | 859 | } |
| 904 | 860 | ||
| 861 | struct timespec __get_wall_to_monotonic(void) | ||
| 862 | { | ||
| 863 | return wall_to_monotonic; | ||
| 864 | } | ||
| 865 | |||
| 905 | struct timespec current_kernel_time(void) | 866 | struct timespec current_kernel_time(void) |
| 906 | { | 867 | { |
| 907 | struct timespec now; | 868 | struct timespec now; |
diff --git a/kernel/timer.c b/kernel/timer.c index efde11e197c4..f1b8afe1ad86 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | |||
| 90 | 90 | ||
| 91 | /* | 91 | /* |
| 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
| 93 | * base in timer_list is guaranteed to be zero. Use the LSB for | 93 | * base in timer_list is guaranteed to be zero. Use the LSB to |
| 94 | * the new flag to indicate whether the timer is deferrable | 94 | * indicate whether the timer is deferrable. |
| 95 | * | ||
| 96 | * A deferrable timer will work normally when the system is busy, but | ||
| 97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
| 98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
| 99 | * subsequent non-deferrable timer. | ||
| 95 | */ | 100 | */ |
| 96 | #define TBASE_DEFERRABLE_FLAG (0x1) | 101 | #define TBASE_DEFERRABLE_FLAG (0x1) |
| 97 | 102 | ||
| @@ -692,12 +697,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
| 692 | cpu = smp_processor_id(); | 697 | cpu = smp_processor_id(); |
| 693 | 698 | ||
| 694 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 699 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) |
| 695 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { | 700 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) |
| 696 | int preferred_cpu = get_nohz_load_balancer(); | 701 | cpu = get_nohz_timer_target(); |
| 697 | |||
| 698 | if (preferred_cpu >= 0) | ||
| 699 | cpu = preferred_cpu; | ||
| 700 | } | ||
| 701 | #endif | 702 | #endif |
| 702 | new_base = per_cpu(tvec_bases, cpu); | 703 | new_base = per_cpu(tvec_bases, cpu); |
| 703 | 704 | ||
| @@ -1302,7 +1303,6 @@ void run_local_timers(void) | |||
| 1302 | { | 1303 | { |
| 1303 | hrtimer_run_queues(); | 1304 | hrtimer_run_queues(); |
| 1304 | raise_softirq(TIMER_SOFTIRQ); | 1305 | raise_softirq(TIMER_SOFTIRQ); |
| 1305 | softlockup_tick(); | ||
| 1306 | } | 1306 | } |
| 1307 | 1307 | ||
| 1308 | /* | 1308 | /* |
| @@ -1763,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs) | |||
| 1763 | } | 1763 | } |
| 1764 | 1764 | ||
| 1765 | EXPORT_SYMBOL(msleep_interruptible); | 1765 | EXPORT_SYMBOL(msleep_interruptible); |
| 1766 | |||
| 1767 | static int __sched do_usleep_range(unsigned long min, unsigned long max) | ||
| 1768 | { | ||
| 1769 | ktime_t kmin; | ||
| 1770 | unsigned long delta; | ||
| 1771 | |||
| 1772 | kmin = ktime_set(0, min * NSEC_PER_USEC); | ||
| 1773 | delta = (max - min) * NSEC_PER_USEC; | ||
| 1774 | return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); | ||
| 1775 | } | ||
| 1776 | |||
| 1777 | /** | ||
| 1778 | * usleep_range - Drop in replacement for udelay where wakeup is flexible | ||
| 1779 | * @min: Minimum time in usecs to sleep | ||
| 1780 | * @max: Maximum time in usecs to sleep | ||
| 1781 | */ | ||
| 1782 | void usleep_range(unsigned long min, unsigned long max) | ||
| 1783 | { | ||
| 1784 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 1785 | do_usleep_range(min, max); | ||
| 1786 | } | ||
| 1787 | EXPORT_SYMBOL(usleep_range); | ||
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545b..538501c6ea50 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -153,7 +153,7 @@ config IRQSOFF_TRACER | |||
| 153 | bool "Interrupts-off Latency Tracer" | 153 | bool "Interrupts-off Latency Tracer" |
| 154 | default n | 154 | default n |
| 155 | depends on TRACE_IRQFLAGS_SUPPORT | 155 | depends on TRACE_IRQFLAGS_SUPPORT |
| 156 | depends on GENERIC_TIME | 156 | depends on !ARCH_USES_GETTIMEOFFSET |
| 157 | select TRACE_IRQFLAGS | 157 | select TRACE_IRQFLAGS |
| 158 | select GENERIC_TRACER | 158 | select GENERIC_TRACER |
| 159 | select TRACER_MAX_TRACE | 159 | select TRACER_MAX_TRACE |
| @@ -175,7 +175,7 @@ config IRQSOFF_TRACER | |||
| 175 | config PREEMPT_TRACER | 175 | config PREEMPT_TRACER |
| 176 | bool "Preemption-off Latency Tracer" | 176 | bool "Preemption-off Latency Tracer" |
| 177 | default n | 177 | default n |
| 178 | depends on GENERIC_TIME | 178 | depends on !ARCH_USES_GETTIMEOFFSET |
| 179 | depends on PREEMPT | 179 | depends on PREEMPT |
| 180 | select GENERIC_TRACER | 180 | select GENERIC_TRACER |
| 181 | select TRACER_MAX_TRACE | 181 | select TRACER_MAX_TRACE |
| @@ -194,15 +194,6 @@ config PREEMPT_TRACER | |||
| 194 | enabled. This option and the irqs-off timing option can be | 194 | enabled. This option and the irqs-off timing option can be |
| 195 | used together or separately.) | 195 | used together or separately.) |
| 196 | 196 | ||
| 197 | config SYSPROF_TRACER | ||
| 198 | bool "Sysprof Tracer" | ||
| 199 | depends on X86 | ||
| 200 | select GENERIC_TRACER | ||
| 201 | select CONTEXT_SWITCH_TRACER | ||
| 202 | help | ||
| 203 | This tracer provides the trace needed by the 'Sysprof' userspace | ||
| 204 | tool. | ||
| 205 | |||
| 206 | config SCHED_TRACER | 197 | config SCHED_TRACER |
| 207 | bool "Scheduling Latency Tracer" | 198 | bool "Scheduling Latency Tracer" |
| 208 | select GENERIC_TRACER | 199 | select GENERIC_TRACER |
| @@ -229,23 +220,6 @@ config FTRACE_SYSCALLS | |||
| 229 | help | 220 | help |
| 230 | Basic tracer to catch the syscall entry and exit events. | 221 | Basic tracer to catch the syscall entry and exit events. |
| 231 | 222 | ||
| 232 | config BOOT_TRACER | ||
| 233 | bool "Trace boot initcalls" | ||
| 234 | select GENERIC_TRACER | ||
| 235 | select CONTEXT_SWITCH_TRACER | ||
| 236 | help | ||
| 237 | This tracer helps developers to optimize boot times: it records | ||
| 238 | the timings of the initcalls and traces key events and the identity | ||
| 239 | of tasks that can cause boot delays, such as context-switches. | ||
| 240 | |||
| 241 | Its aim is to be parsed by the scripts/bootgraph.pl tool to | ||
| 242 | produce pretty graphics about boot inefficiencies, giving a visual | ||
| 243 | representation of the delays during initcalls - but the raw | ||
| 244 | /debug/tracing/trace text output is readable too. | ||
| 245 | |||
| 246 | You must pass in initcall_debug and ftrace=initcall to the kernel | ||
| 247 | command line to enable this on bootup. | ||
| 248 | |||
| 249 | config TRACE_BRANCH_PROFILING | 223 | config TRACE_BRANCH_PROFILING |
| 250 | bool | 224 | bool |
| 251 | select GENERIC_TRACER | 225 | select GENERIC_TRACER |
| @@ -325,28 +299,6 @@ config BRANCH_TRACER | |||
| 325 | 299 | ||
| 326 | Say N if unsure. | 300 | Say N if unsure. |
| 327 | 301 | ||
| 328 | config KSYM_TRACER | ||
| 329 | bool "Trace read and write access on kernel memory locations" | ||
| 330 | depends on HAVE_HW_BREAKPOINT | ||
| 331 | select TRACING | ||
| 332 | help | ||
| 333 | This tracer helps find read and write operations on any given kernel | ||
| 334 | symbol i.e. /proc/kallsyms. | ||
| 335 | |||
| 336 | config PROFILE_KSYM_TRACER | ||
| 337 | bool "Profile all kernel memory accesses on 'watched' variables" | ||
| 338 | depends on KSYM_TRACER | ||
| 339 | help | ||
| 340 | This tracer profiles kernel accesses on variables watched through the | ||
| 341 | ksym tracer ftrace plugin. Depending upon the hardware, all read | ||
| 342 | and write operations on kernel variables can be monitored for | ||
| 343 | accesses. | ||
| 344 | |||
| 345 | The results will be displayed in: | ||
| 346 | /debugfs/tracing/profile_ksym | ||
| 347 | |||
| 348 | Say N if unsure. | ||
| 349 | |||
| 350 | config STACK_TRACER | 302 | config STACK_TRACER |
| 351 | bool "Trace max stack" | 303 | bool "Trace max stack" |
| 352 | depends on HAVE_FUNCTION_TRACER | 304 | depends on HAVE_FUNCTION_TRACER |
| @@ -371,37 +323,6 @@ config STACK_TRACER | |||
| 371 | 323 | ||
| 372 | Say N if unsure. | 324 | Say N if unsure. |
| 373 | 325 | ||
| 374 | config KMEMTRACE | ||
| 375 | bool "Trace SLAB allocations" | ||
| 376 | select GENERIC_TRACER | ||
| 377 | help | ||
| 378 | kmemtrace provides tracing for slab allocator functions, such as | ||
| 379 | kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected | ||
| 380 | data is then fed to the userspace application in order to analyse | ||
| 381 | allocation hotspots, internal fragmentation and so on, making it | ||
| 382 | possible to see how well an allocator performs, as well as debug | ||
| 383 | and profile kernel code. | ||
| 384 | |||
| 385 | This requires an userspace application to use. See | ||
| 386 | Documentation/trace/kmemtrace.txt for more information. | ||
| 387 | |||
| 388 | Saying Y will make the kernel somewhat larger and slower. However, | ||
| 389 | if you disable kmemtrace at run-time or boot-time, the performance | ||
| 390 | impact is minimal (depending on the arch the kernel is built for). | ||
| 391 | |||
| 392 | If unsure, say N. | ||
| 393 | |||
| 394 | config WORKQUEUE_TRACER | ||
| 395 | bool "Trace workqueues" | ||
| 396 | select GENERIC_TRACER | ||
| 397 | help | ||
| 398 | The workqueue tracer provides some statistical information | ||
| 399 | about each cpu workqueue thread such as the number of the | ||
| 400 | works inserted and executed since their creation. It can help | ||
| 401 | to evaluate the amount of work each of them has to perform. | ||
| 402 | For example it can help a developer to decide whether he should | ||
| 403 | choose a per-cpu workqueue instead of a singlethreaded one. | ||
| 404 | |||
| 405 | config BLK_DEV_IO_TRACE | 326 | config BLK_DEV_IO_TRACE |
| 406 | bool "Support for tracing block IO actions" | 327 | bool "Support for tracing block IO actions" |
| 407 | depends on SYSFS | 328 | depends on SYSFS |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 4215530b490b..53f338190b26 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o | |||
| 30 | obj-$(CONFIG_TRACING) += trace_stat.o | 30 | obj-$(CONFIG_TRACING) += trace_stat.o |
| 31 | obj-$(CONFIG_TRACING) += trace_printk.o | 31 | obj-$(CONFIG_TRACING) += trace_printk.o |
| 32 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o | 32 | obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o |
| 33 | obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o | ||
| 34 | obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o | 33 | obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o |
| 35 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o | 34 | obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o |
| 36 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o | 35 | obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o |
| @@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o | |||
| 38 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o | 37 | obj-$(CONFIG_NOP_TRACER) += trace_nop.o |
| 39 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o | 38 | obj-$(CONFIG_STACK_TRACER) += trace_stack.o |
| 40 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o | 39 | obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o |
| 41 | obj-$(CONFIG_BOOT_TRACER) += trace_boot.o | ||
| 42 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o | 40 | obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o |
| 43 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o | 41 | obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o |
| 44 | obj-$(CONFIG_KMEMTRACE) += kmemtrace.o | ||
| 45 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o | 42 | obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o |
| 46 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o | 43 | obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o |
| 47 | ifeq ($(CONFIG_BLOCK),y) | 44 | ifeq ($(CONFIG_BLOCK),y) |
| @@ -55,7 +52,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
| 55 | endif | 52 | endif |
| 56 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
| 57 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
| 58 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o | ||
| 59 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 55 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o |
| 60 | ifeq ($(CONFIG_TRACING),y) | 56 | ifeq ($(CONFIG_TRACING),y) |
| 61 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f9449..0d88ce9b9fb8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
| 1883 | struct hlist_head *hhd; | 1883 | struct hlist_head *hhd; |
| 1884 | struct hlist_node *n; | 1884 | struct hlist_node *n; |
| 1885 | unsigned long key; | 1885 | unsigned long key; |
| 1886 | int resched; | ||
| 1887 | 1886 | ||
| 1888 | key = hash_long(ip, FTRACE_HASH_BITS); | 1887 | key = hash_long(ip, FTRACE_HASH_BITS); |
| 1889 | 1888 | ||
| @@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | |||
| 1897 | * period. This syncs the hash iteration and freeing of items | 1896 | * period. This syncs the hash iteration and freeing of items |
| 1898 | * on the hash. rcu_read_lock is too dangerous here. | 1897 | * on the hash. rcu_read_lock is too dangerous here. |
| 1899 | */ | 1898 | */ |
| 1900 | resched = ftrace_preempt_disable(); | 1899 | preempt_disable_notrace(); |
| 1901 | hlist_for_each_entry_rcu(entry, n, hhd, node) { | 1900 | hlist_for_each_entry_rcu(entry, n, hhd, node) { |
| 1902 | if (entry->ip == ip) | 1901 | if (entry->ip == ip) |
| 1903 | entry->ops->func(ip, parent_ip, &entry->data); | 1902 | entry->ops->func(ip, parent_ip, &entry->data); |
| 1904 | } | 1903 | } |
| 1905 | ftrace_preempt_enable(resched); | 1904 | preempt_enable_notrace(); |
| 1906 | } | 1905 | } |
| 1907 | 1906 | ||
| 1908 | static struct ftrace_ops trace_probe_ops __read_mostly = | 1907 | static struct ftrace_ops trace_probe_ops __read_mostly = |
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660b..000000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null | |||
| @@ -1,529 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Memory allocator tracing | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 Eduard - Gabriel Munteanu | ||
| 5 | * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> | ||
| 6 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/tracepoint.h> | ||
| 10 | #include <linux/seq_file.h> | ||
| 11 | #include <linux/debugfs.h> | ||
| 12 | #include <linux/dcache.h> | ||
| 13 | #include <linux/fs.h> | ||
| 14 | |||
| 15 | #include <linux/kmemtrace.h> | ||
| 16 | |||
| 17 | #include "trace_output.h" | ||
| 18 | #include "trace.h" | ||
| 19 | |||
| 20 | /* Select an alternative, minimalistic output than the original one */ | ||
| 21 | #define TRACE_KMEM_OPT_MINIMAL 0x1 | ||
| 22 | |||
| 23 | static struct tracer_opt kmem_opts[] = { | ||
| 24 | /* Default disable the minimalistic output */ | ||
| 25 | { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, | ||
| 26 | { } | ||
| 27 | }; | ||
| 28 | |||
| 29 | static struct tracer_flags kmem_tracer_flags = { | ||
| 30 | .val = 0, | ||
| 31 | .opts = kmem_opts | ||
| 32 | }; | ||
| 33 | |||
| 34 | static struct trace_array *kmemtrace_array; | ||
| 35 | |||
| 36 | /* Trace allocations */ | ||
| 37 | static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, | ||
| 38 | unsigned long call_site, | ||
| 39 | const void *ptr, | ||
| 40 | size_t bytes_req, | ||
| 41 | size_t bytes_alloc, | ||
| 42 | gfp_t gfp_flags, | ||
| 43 | int node) | ||
| 44 | { | ||
| 45 | struct ftrace_event_call *call = &event_kmem_alloc; | ||
| 46 | struct trace_array *tr = kmemtrace_array; | ||
| 47 | struct kmemtrace_alloc_entry *entry; | ||
| 48 | struct ring_buffer_event *event; | ||
| 49 | |||
| 50 | event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); | ||
| 51 | if (!event) | ||
| 52 | return; | ||
| 53 | |||
| 54 | entry = ring_buffer_event_data(event); | ||
| 55 | tracing_generic_entry_update(&entry->ent, 0, 0); | ||
| 56 | |||
| 57 | entry->ent.type = TRACE_KMEM_ALLOC; | ||
| 58 | entry->type_id = type_id; | ||
| 59 | entry->call_site = call_site; | ||
| 60 | entry->ptr = ptr; | ||
| 61 | entry->bytes_req = bytes_req; | ||
| 62 | entry->bytes_alloc = bytes_alloc; | ||
| 63 | entry->gfp_flags = gfp_flags; | ||
| 64 | entry->node = node; | ||
| 65 | |||
| 66 | if (!filter_check_discard(call, entry, tr->buffer, event)) | ||
| 67 | ring_buffer_unlock_commit(tr->buffer, event); | ||
| 68 | |||
| 69 | trace_wake_up(); | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline void kmemtrace_free(enum kmemtrace_type_id type_id, | ||
| 73 | unsigned long call_site, | ||
| 74 | const void *ptr) | ||
| 75 | { | ||
| 76 | struct ftrace_event_call *call = &event_kmem_free; | ||
| 77 | struct trace_array *tr = kmemtrace_array; | ||
| 78 | struct kmemtrace_free_entry *entry; | ||
| 79 | struct ring_buffer_event *event; | ||
| 80 | |||
| 81 | event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); | ||
| 82 | if (!event) | ||
| 83 | return; | ||
| 84 | entry = ring_buffer_event_data(event); | ||
| 85 | tracing_generic_entry_update(&entry->ent, 0, 0); | ||
| 86 | |||
| 87 | entry->ent.type = TRACE_KMEM_FREE; | ||
| 88 | entry->type_id = type_id; | ||
| 89 | entry->call_site = call_site; | ||
| 90 | entry->ptr = ptr; | ||
| 91 | |||
| 92 | if (!filter_check_discard(call, entry, tr->buffer, event)) | ||
| 93 | ring_buffer_unlock_commit(tr->buffer, event); | ||
| 94 | |||
| 95 | trace_wake_up(); | ||
| 96 | } | ||
| 97 | |||
| 98 | static void kmemtrace_kmalloc(void *ignore, | ||
| 99 | unsigned long call_site, | ||
| 100 | const void *ptr, | ||
| 101 | size_t bytes_req, | ||
| 102 | size_t bytes_alloc, | ||
| 103 | gfp_t gfp_flags) | ||
| 104 | { | ||
| 105 | kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, | ||
| 106 | bytes_req, bytes_alloc, gfp_flags, -1); | ||
| 107 | } | ||
| 108 | |||
| 109 | static void kmemtrace_kmem_cache_alloc(void *ignore, | ||
| 110 | unsigned long call_site, | ||
| 111 | const void *ptr, | ||
| 112 | size_t bytes_req, | ||
| 113 | size_t bytes_alloc, | ||
| 114 | gfp_t gfp_flags) | ||
| 115 | { | ||
| 116 | kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, | ||
| 117 | bytes_req, bytes_alloc, gfp_flags, -1); | ||
| 118 | } | ||
| 119 | |||
| 120 | static void kmemtrace_kmalloc_node(void *ignore, | ||
| 121 | unsigned long call_site, | ||
| 122 | const void *ptr, | ||
| 123 | size_t bytes_req, | ||
| 124 | size_t bytes_alloc, | ||
| 125 | gfp_t gfp_flags, | ||
| 126 | int node) | ||
| 127 | { | ||
| 128 | kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, | ||
| 129 | bytes_req, bytes_alloc, gfp_flags, node); | ||
| 130 | } | ||
| 131 | |||
| 132 | static void kmemtrace_kmem_cache_alloc_node(void *ignore, | ||
| 133 | unsigned long call_site, | ||
| 134 | const void *ptr, | ||
| 135 | size_t bytes_req, | ||
| 136 | size_t bytes_alloc, | ||
| 137 | gfp_t gfp_flags, | ||
| 138 | int node) | ||
| 139 | { | ||
| 140 | kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, | ||
| 141 | bytes_req, bytes_alloc, gfp_flags, node); | ||
| 142 | } | ||
| 143 | |||
| 144 | static void | ||
| 145 | kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) | ||
| 146 | { | ||
| 147 | kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); | ||
| 148 | } | ||
| 149 | |||
| 150 | static void kmemtrace_kmem_cache_free(void *ignore, | ||
| 151 | unsigned long call_site, const void *ptr) | ||
| 152 | { | ||
| 153 | kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); | ||
| 154 | } | ||
| 155 | |||
| 156 | static int kmemtrace_start_probes(void) | ||
| 157 | { | ||
| 158 | int err; | ||
| 159 | |||
| 160 | err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); | ||
| 161 | if (err) | ||
| 162 | return err; | ||
| 163 | err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); | ||
| 164 | if (err) | ||
| 165 | return err; | ||
| 166 | err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); | ||
| 167 | if (err) | ||
| 168 | return err; | ||
| 169 | err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); | ||
| 170 | if (err) | ||
| 171 | return err; | ||
| 172 | err = register_trace_kfree(kmemtrace_kfree, NULL); | ||
| 173 | if (err) | ||
| 174 | return err; | ||
| 175 | err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); | ||
| 176 | |||
| 177 | return err; | ||
| 178 | } | ||
| 179 | |||
| 180 | static void kmemtrace_stop_probes(void) | ||
| 181 | { | ||
| 182 | unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); | ||
| 183 | unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); | ||
| 184 | unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); | ||
| 185 | unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); | ||
| 186 | unregister_trace_kfree(kmemtrace_kfree, NULL); | ||
| 187 | unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); | ||
| 188 | } | ||
| 189 | |||
| 190 | static int kmem_trace_init(struct trace_array *tr) | ||
| 191 | { | ||
| 192 | kmemtrace_array = tr; | ||
| 193 | |||
| 194 | tracing_reset_online_cpus(tr); | ||
| 195 | |||
| 196 | kmemtrace_start_probes(); | ||
| 197 | |||
| 198 | return 0; | ||
| 199 | } | ||
| 200 | |||
| 201 | static void kmem_trace_reset(struct trace_array *tr) | ||
| 202 | { | ||
| 203 | kmemtrace_stop_probes(); | ||
| 204 | } | ||
| 205 | |||
| 206 | static void kmemtrace_headers(struct seq_file *s) | ||
| 207 | { | ||
| 208 | /* Don't need headers for the original kmemtrace output */ | ||
| 209 | if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) | ||
| 210 | return; | ||
| 211 | |||
| 212 | seq_printf(s, "#\n"); | ||
| 213 | seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " | ||
| 214 | " POINTER NODE CALLER\n"); | ||
| 215 | seq_printf(s, "# FREE | | | | " | ||
| 216 | " | | | |\n"); | ||
| 217 | seq_printf(s, "# |\n\n"); | ||
| 218 | } | ||
| 219 | |||
| 220 | /* | ||
| 221 | * The following functions give the original output from kmemtrace, | ||
| 222 | * plus the origin CPU, since reordering occurs in-kernel now. | ||
| 223 | */ | ||
| 224 | |||
| 225 | #define KMEMTRACE_USER_ALLOC 0 | ||
| 226 | #define KMEMTRACE_USER_FREE 1 | ||
| 227 | |||
| 228 | struct kmemtrace_user_event { | ||
| 229 | u8 event_id; | ||
| 230 | u8 type_id; | ||
| 231 | u16 event_size; | ||
| 232 | u32 cpu; | ||
| 233 | u64 timestamp; | ||
| 234 | unsigned long call_site; | ||
| 235 | unsigned long ptr; | ||
| 236 | }; | ||
| 237 | |||
| 238 | struct kmemtrace_user_event_alloc { | ||
| 239 | size_t bytes_req; | ||
| 240 | size_t bytes_alloc; | ||
| 241 | unsigned gfp_flags; | ||
| 242 | int node; | ||
| 243 | }; | ||
| 244 | |||
| 245 | static enum print_line_t | ||
| 246 | kmemtrace_print_alloc(struct trace_iterator *iter, int flags, | ||
| 247 | struct trace_event *event) | ||
| 248 | { | ||
| 249 | struct trace_seq *s = &iter->seq; | ||
| 250 | struct kmemtrace_alloc_entry *entry; | ||
| 251 | int ret; | ||
| 252 | |||
| 253 | trace_assign_type(entry, iter->ent); | ||
| 254 | |||
| 255 | ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " | ||
| 256 | "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", | ||
| 257 | entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, | ||
| 258 | (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, | ||
| 259 | (unsigned long)entry->gfp_flags, entry->node); | ||
| 260 | |||
| 261 | if (!ret) | ||
| 262 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 263 | return TRACE_TYPE_HANDLED; | ||
| 264 | } | ||
| 265 | |||
| 266 | static enum print_line_t | ||
| 267 | kmemtrace_print_free(struct trace_iterator *iter, int flags, | ||
| 268 | struct trace_event *event) | ||
| 269 | { | ||
| 270 | struct trace_seq *s = &iter->seq; | ||
| 271 | struct kmemtrace_free_entry *entry; | ||
| 272 | int ret; | ||
| 273 | |||
| 274 | trace_assign_type(entry, iter->ent); | ||
| 275 | |||
| 276 | ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", | ||
| 277 | entry->type_id, (void *)entry->call_site, | ||
| 278 | (unsigned long)entry->ptr); | ||
| 279 | |||
| 280 | if (!ret) | ||
| 281 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 282 | return TRACE_TYPE_HANDLED; | ||
| 283 | } | ||
| 284 | |||
| 285 | static enum print_line_t | ||
| 286 | kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, | ||
| 287 | struct trace_event *event) | ||
| 288 | { | ||
| 289 | struct trace_seq *s = &iter->seq; | ||
| 290 | struct kmemtrace_alloc_entry *entry; | ||
| 291 | struct kmemtrace_user_event *ev; | ||
| 292 | struct kmemtrace_user_event_alloc *ev_alloc; | ||
| 293 | |||
| 294 | trace_assign_type(entry, iter->ent); | ||
| 295 | |||
| 296 | ev = trace_seq_reserve(s, sizeof(*ev)); | ||
| 297 | if (!ev) | ||
| 298 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 299 | |||
| 300 | ev->event_id = KMEMTRACE_USER_ALLOC; | ||
| 301 | ev->type_id = entry->type_id; | ||
| 302 | ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); | ||
| 303 | ev->cpu = iter->cpu; | ||
| 304 | ev->timestamp = iter->ts; | ||
| 305 | ev->call_site = entry->call_site; | ||
| 306 | ev->ptr = (unsigned long)entry->ptr; | ||
| 307 | |||
| 308 | ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); | ||
| 309 | if (!ev_alloc) | ||
| 310 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 311 | |||
| 312 | ev_alloc->bytes_req = entry->bytes_req; | ||
| 313 | ev_alloc->bytes_alloc = entry->bytes_alloc; | ||
| 314 | ev_alloc->gfp_flags = entry->gfp_flags; | ||
| 315 | ev_alloc->node = entry->node; | ||
| 316 | |||
| 317 | return TRACE_TYPE_HANDLED; | ||
| 318 | } | ||
| 319 | |||
| 320 | static enum print_line_t | ||
| 321 | kmemtrace_print_free_user(struct trace_iterator *iter, int flags, | ||
| 322 | struct trace_event *event) | ||
| 323 | { | ||
| 324 | struct trace_seq *s = &iter->seq; | ||
| 325 | struct kmemtrace_free_entry *entry; | ||
| 326 | struct kmemtrace_user_event *ev; | ||
| 327 | |||
| 328 | trace_assign_type(entry, iter->ent); | ||
| 329 | |||
| 330 | ev = trace_seq_reserve(s, sizeof(*ev)); | ||
| 331 | if (!ev) | ||
| 332 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 333 | |||
| 334 | ev->event_id = KMEMTRACE_USER_FREE; | ||
| 335 | ev->type_id = entry->type_id; | ||
| 336 | ev->event_size = sizeof(*ev); | ||
| 337 | ev->cpu = iter->cpu; | ||
| 338 | ev->timestamp = iter->ts; | ||
| 339 | ev->call_site = entry->call_site; | ||
| 340 | ev->ptr = (unsigned long)entry->ptr; | ||
| 341 | |||
| 342 | return TRACE_TYPE_HANDLED; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* The two other following provide a more minimalistic output */ | ||
| 346 | static enum print_line_t | ||
| 347 | kmemtrace_print_alloc_compress(struct trace_iterator *iter) | ||
| 348 | { | ||
| 349 | struct kmemtrace_alloc_entry *entry; | ||
| 350 | struct trace_seq *s = &iter->seq; | ||
| 351 | int ret; | ||
| 352 | |||
| 353 | trace_assign_type(entry, iter->ent); | ||
| 354 | |||
| 355 | /* Alloc entry */ | ||
| 356 | ret = trace_seq_printf(s, " + "); | ||
| 357 | if (!ret) | ||
| 358 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 359 | |||
| 360 | /* Type */ | ||
| 361 | switch (entry->type_id) { | ||
| 362 | case KMEMTRACE_TYPE_KMALLOC: | ||
| 363 | ret = trace_seq_printf(s, "K "); | ||
| 364 | break; | ||
| 365 | case KMEMTRACE_TYPE_CACHE: | ||
| 366 | ret = trace_seq_printf(s, "C "); | ||
| 367 | break; | ||
| 368 | case KMEMTRACE_TYPE_PAGES: | ||
| 369 | ret = trace_seq_printf(s, "P "); | ||
| 370 | break; | ||
| 371 | default: | ||
| 372 | ret = trace_seq_printf(s, "? "); | ||
| 373 | } | ||
| 374 | |||
| 375 | if (!ret) | ||
| 376 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 377 | |||
| 378 | /* Requested */ | ||
| 379 | ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); | ||
| 380 | if (!ret) | ||
| 381 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 382 | |||
| 383 | /* Allocated */ | ||
| 384 | ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); | ||
| 385 | if (!ret) | ||
| 386 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 387 | |||
| 388 | /* Flags | ||
| 389 | * TODO: would be better to see the name of the GFP flag names | ||
| 390 | */ | ||
| 391 | ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); | ||
| 392 | if (!ret) | ||
| 393 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 394 | |||
| 395 | /* Pointer to allocated */ | ||
| 396 | ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); | ||
| 397 | if (!ret) | ||
| 398 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 399 | |||
| 400 | /* Node and call site*/ | ||
| 401 | ret = trace_seq_printf(s, "%4d %pf\n", entry->node, | ||
| 402 | (void *)entry->call_site); | ||
| 403 | if (!ret) | ||
| 404 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 405 | |||
| 406 | return TRACE_TYPE_HANDLED; | ||
| 407 | } | ||
| 408 | |||
| 409 | static enum print_line_t | ||
| 410 | kmemtrace_print_free_compress(struct trace_iterator *iter) | ||
| 411 | { | ||
| 412 | struct kmemtrace_free_entry *entry; | ||
| 413 | struct trace_seq *s = &iter->seq; | ||
| 414 | int ret; | ||
| 415 | |||
| 416 | trace_assign_type(entry, iter->ent); | ||
| 417 | |||
| 418 | /* Free entry */ | ||
| 419 | ret = trace_seq_printf(s, " - "); | ||
| 420 | if (!ret) | ||
| 421 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 422 | |||
| 423 | /* Type */ | ||
| 424 | switch (entry->type_id) { | ||
| 425 | case KMEMTRACE_TYPE_KMALLOC: | ||
| 426 | ret = trace_seq_printf(s, "K "); | ||
| 427 | break; | ||
| 428 | case KMEMTRACE_TYPE_CACHE: | ||
| 429 | ret = trace_seq_printf(s, "C "); | ||
| 430 | break; | ||
| 431 | case KMEMTRACE_TYPE_PAGES: | ||
| 432 | ret = trace_seq_printf(s, "P "); | ||
| 433 | break; | ||
| 434 | default: | ||
| 435 | ret = trace_seq_printf(s, "? "); | ||
| 436 | } | ||
| 437 | |||
| 438 | if (!ret) | ||
| 439 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 440 | |||
| 441 | /* Skip requested/allocated/flags */ | ||
| 442 | ret = trace_seq_printf(s, " "); | ||
| 443 | if (!ret) | ||
| 444 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 445 | |||
| 446 | /* Pointer to allocated */ | ||
| 447 | ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); | ||
| 448 | if (!ret) | ||
| 449 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 450 | |||
| 451 | /* Skip node and print call site*/ | ||
| 452 | ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); | ||
| 453 | if (!ret) | ||
| 454 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 455 | |||
| 456 | return TRACE_TYPE_HANDLED; | ||
| 457 | } | ||
| 458 | |||
| 459 | static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) | ||
| 460 | { | ||
| 461 | struct trace_entry *entry = iter->ent; | ||
| 462 | |||
| 463 | if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) | ||
| 464 | return TRACE_TYPE_UNHANDLED; | ||
| 465 | |||
| 466 | switch (entry->type) { | ||
| 467 | case TRACE_KMEM_ALLOC: | ||
| 468 | return kmemtrace_print_alloc_compress(iter); | ||
| 469 | case TRACE_KMEM_FREE: | ||
| 470 | return kmemtrace_print_free_compress(iter); | ||
| 471 | default: | ||
| 472 | return TRACE_TYPE_UNHANDLED; | ||
| 473 | } | ||
| 474 | } | ||
| 475 | |||
| 476 | static struct trace_event_functions kmem_trace_alloc_funcs = { | ||
| 477 | .trace = kmemtrace_print_alloc, | ||
| 478 | .binary = kmemtrace_print_alloc_user, | ||
| 479 | }; | ||
| 480 | |||
| 481 | static struct trace_event kmem_trace_alloc = { | ||
| 482 | .type = TRACE_KMEM_ALLOC, | ||
| 483 | .funcs = &kmem_trace_alloc_funcs, | ||
| 484 | }; | ||
| 485 | |||
| 486 | static struct trace_event_functions kmem_trace_free_funcs = { | ||
| 487 | .trace = kmemtrace_print_free, | ||
| 488 | .binary = kmemtrace_print_free_user, | ||
| 489 | }; | ||
| 490 | |||
| 491 | static struct trace_event kmem_trace_free = { | ||
| 492 | .type = TRACE_KMEM_FREE, | ||
| 493 | .funcs = &kmem_trace_free_funcs, | ||
| 494 | }; | ||
| 495 | |||
| 496 | static struct tracer kmem_tracer __read_mostly = { | ||
| 497 | .name = "kmemtrace", | ||
| 498 | .init = kmem_trace_init, | ||
| 499 | .reset = kmem_trace_reset, | ||
| 500 | .print_line = kmemtrace_print_line, | ||
| 501 | .print_header = kmemtrace_headers, | ||
| 502 | .flags = &kmem_tracer_flags | ||
| 503 | }; | ||
| 504 | |||
| 505 | void kmemtrace_init(void) | ||
| 506 | { | ||
| 507 | /* earliest opportunity to start kmem tracing */ | ||
| 508 | } | ||
| 509 | |||
| 510 | static int __init init_kmem_tracer(void) | ||
| 511 | { | ||
| 512 | if (!register_ftrace_event(&kmem_trace_alloc)) { | ||
| 513 | pr_warning("Warning: could not register kmem events\n"); | ||
| 514 | return 1; | ||
| 515 | } | ||
| 516 | |||
| 517 | if (!register_ftrace_event(&kmem_trace_free)) { | ||
| 518 | pr_warning("Warning: could not register kmem events\n"); | ||
| 519 | return 1; | ||
| 520 | } | ||
| 521 | |||
| 522 | if (register_tracer(&kmem_tracer) != 0) { | ||
| 523 | pr_warning("Warning: could not register the kmem tracer\n"); | ||
| 524 | return 1; | ||
| 525 | } | ||
| 526 | |||
| 527 | return 0; | ||
| 528 | } | ||
| 529 | device_initcall(init_kmem_tracer); | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6ea8b85..3632ce87674f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) | |||
| 443 | */ | 443 | */ |
| 444 | struct ring_buffer_per_cpu { | 444 | struct ring_buffer_per_cpu { |
| 445 | int cpu; | 445 | int cpu; |
| 446 | atomic_t record_disabled; | ||
| 446 | struct ring_buffer *buffer; | 447 | struct ring_buffer *buffer; |
| 447 | spinlock_t reader_lock; /* serialize readers */ | 448 | spinlock_t reader_lock; /* serialize readers */ |
| 448 | arch_spinlock_t lock; | 449 | arch_spinlock_t lock; |
| @@ -462,7 +463,6 @@ struct ring_buffer_per_cpu { | |||
| 462 | unsigned long read; | 463 | unsigned long read; |
| 463 | u64 write_stamp; | 464 | u64 write_stamp; |
| 464 | u64 read_stamp; | 465 | u64 read_stamp; |
| 465 | atomic_t record_disabled; | ||
| 466 | }; | 466 | }; |
| 467 | 467 | ||
| 468 | struct ring_buffer { | 468 | struct ring_buffer { |
| @@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void) | |||
| 2242 | 2242 | ||
| 2243 | #endif | 2243 | #endif |
| 2244 | 2244 | ||
| 2245 | static DEFINE_PER_CPU(int, rb_need_resched); | ||
| 2246 | |||
| 2247 | /** | 2245 | /** |
| 2248 | * ring_buffer_lock_reserve - reserve a part of the buffer | 2246 | * ring_buffer_lock_reserve - reserve a part of the buffer |
| 2249 | * @buffer: the ring buffer to reserve from | 2247 | * @buffer: the ring buffer to reserve from |
| @@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
| 2264 | { | 2262 | { |
| 2265 | struct ring_buffer_per_cpu *cpu_buffer; | 2263 | struct ring_buffer_per_cpu *cpu_buffer; |
| 2266 | struct ring_buffer_event *event; | 2264 | struct ring_buffer_event *event; |
| 2267 | int cpu, resched; | 2265 | int cpu; |
| 2268 | 2266 | ||
| 2269 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2267 | if (ring_buffer_flags != RB_BUFFERS_ON) |
| 2270 | return NULL; | 2268 | return NULL; |
| 2271 | 2269 | ||
| 2272 | /* If we are tracing schedule, we don't want to recurse */ | 2270 | /* If we are tracing schedule, we don't want to recurse */ |
| 2273 | resched = ftrace_preempt_disable(); | 2271 | preempt_disable_notrace(); |
| 2274 | 2272 | ||
| 2275 | if (atomic_read(&buffer->record_disabled)) | 2273 | if (atomic_read(&buffer->record_disabled)) |
| 2276 | goto out_nocheck; | 2274 | goto out_nocheck; |
| @@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) | |||
| 2295 | if (!event) | 2293 | if (!event) |
| 2296 | goto out; | 2294 | goto out; |
| 2297 | 2295 | ||
| 2298 | /* | ||
| 2299 | * Need to store resched state on this cpu. | ||
| 2300 | * Only the first needs to. | ||
| 2301 | */ | ||
| 2302 | |||
| 2303 | if (preempt_count() == 1) | ||
| 2304 | per_cpu(rb_need_resched, cpu) = resched; | ||
| 2305 | |||
| 2306 | return event; | 2296 | return event; |
| 2307 | 2297 | ||
| 2308 | out: | 2298 | out: |
| 2309 | trace_recursive_unlock(); | 2299 | trace_recursive_unlock(); |
| 2310 | 2300 | ||
| 2311 | out_nocheck: | 2301 | out_nocheck: |
| 2312 | ftrace_preempt_enable(resched); | 2302 | preempt_enable_notrace(); |
| 2313 | return NULL; | 2303 | return NULL; |
| 2314 | } | 2304 | } |
| 2315 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); | 2305 | EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); |
| @@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, | |||
| 2355 | 2345 | ||
| 2356 | trace_recursive_unlock(); | 2346 | trace_recursive_unlock(); |
| 2357 | 2347 | ||
| 2358 | /* | 2348 | preempt_enable_notrace(); |
| 2359 | * Only the last preempt count needs to restore preemption. | ||
| 2360 | */ | ||
| 2361 | if (preempt_count() == 1) | ||
| 2362 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
| 2363 | else | ||
| 2364 | preempt_enable_no_resched_notrace(); | ||
| 2365 | 2349 | ||
| 2366 | return 0; | 2350 | return 0; |
| 2367 | } | 2351 | } |
| @@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, | |||
| 2469 | 2453 | ||
| 2470 | trace_recursive_unlock(); | 2454 | trace_recursive_unlock(); |
| 2471 | 2455 | ||
| 2472 | /* | 2456 | preempt_enable_notrace(); |
| 2473 | * Only the last preempt count needs to restore preemption. | ||
| 2474 | */ | ||
| 2475 | if (preempt_count() == 1) | ||
| 2476 | ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); | ||
| 2477 | else | ||
| 2478 | preempt_enable_no_resched_notrace(); | ||
| 2479 | 2457 | ||
| 2480 | } | 2458 | } |
| 2481 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); | 2459 | EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); |
| @@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
| 2501 | struct ring_buffer_event *event; | 2479 | struct ring_buffer_event *event; |
| 2502 | void *body; | 2480 | void *body; |
| 2503 | int ret = -EBUSY; | 2481 | int ret = -EBUSY; |
| 2504 | int cpu, resched; | 2482 | int cpu; |
| 2505 | 2483 | ||
| 2506 | if (ring_buffer_flags != RB_BUFFERS_ON) | 2484 | if (ring_buffer_flags != RB_BUFFERS_ON) |
| 2507 | return -EBUSY; | 2485 | return -EBUSY; |
| 2508 | 2486 | ||
| 2509 | resched = ftrace_preempt_disable(); | 2487 | preempt_disable_notrace(); |
| 2510 | 2488 | ||
| 2511 | if (atomic_read(&buffer->record_disabled)) | 2489 | if (atomic_read(&buffer->record_disabled)) |
| 2512 | goto out; | 2490 | goto out; |
| @@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
| 2536 | 2514 | ||
| 2537 | ret = 0; | 2515 | ret = 0; |
| 2538 | out: | 2516 | out: |
| 2539 | ftrace_preempt_enable(resched); | 2517 | preempt_enable_notrace(); |
| 2540 | 2518 | ||
| 2541 | return ret; | 2519 | return ret; |
| 2542 | } | 2520 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d6736b93dc2a..ba14a22be4cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -341,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
| 341 | /* trace_flags holds trace_options default values */ | 341 | /* trace_flags holds trace_options default values */ |
| 342 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 342 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
| 343 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 343 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
| 344 | TRACE_ITER_GRAPH_TIME; | 344 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; |
| 345 | 345 | ||
| 346 | static int trace_stop_count; | 346 | static int trace_stop_count; |
| 347 | static DEFINE_SPINLOCK(tracing_start_lock); | 347 | static DEFINE_SPINLOCK(tracing_start_lock); |
| @@ -425,6 +425,7 @@ static const char *trace_options[] = { | |||
| 425 | "latency-format", | 425 | "latency-format", |
| 426 | "sleep-time", | 426 | "sleep-time", |
| 427 | "graph-time", | 427 | "graph-time", |
| 428 | "record-cmd", | ||
| 428 | NULL | 429 | NULL |
| 429 | }; | 430 | }; |
| 430 | 431 | ||
| @@ -656,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 656 | return; | 657 | return; |
| 657 | 658 | ||
| 658 | WARN_ON_ONCE(!irqs_disabled()); | 659 | WARN_ON_ONCE(!irqs_disabled()); |
| 660 | if (!current_trace->use_max_tr) { | ||
| 661 | WARN_ON_ONCE(1); | ||
| 662 | return; | ||
| 663 | } | ||
| 659 | arch_spin_lock(&ftrace_max_lock); | 664 | arch_spin_lock(&ftrace_max_lock); |
| 660 | 665 | ||
| 661 | tr->buffer = max_tr.buffer; | 666 | tr->buffer = max_tr.buffer; |
| @@ -682,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 682 | return; | 687 | return; |
| 683 | 688 | ||
| 684 | WARN_ON_ONCE(!irqs_disabled()); | 689 | WARN_ON_ONCE(!irqs_disabled()); |
| 690 | if (!current_trace->use_max_tr) { | ||
| 691 | WARN_ON_ONCE(1); | ||
| 692 | return; | ||
| 693 | } | ||
| 694 | |||
| 685 | arch_spin_lock(&ftrace_max_lock); | 695 | arch_spin_lock(&ftrace_max_lock); |
| 686 | 696 | ||
| 687 | ftrace_disable_cpu(); | 697 | ftrace_disable_cpu(); |
| @@ -726,18 +736,11 @@ __acquires(kernel_lock) | |||
| 726 | return -1; | 736 | return -1; |
| 727 | } | 737 | } |
| 728 | 738 | ||
| 729 | if (strlen(type->name) > MAX_TRACER_SIZE) { | 739 | if (strlen(type->name) >= MAX_TRACER_SIZE) { |
| 730 | pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); | 740 | pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); |
| 731 | return -1; | 741 | return -1; |
| 732 | } | 742 | } |
| 733 | 743 | ||
| 734 | /* | ||
| 735 | * When this gets called we hold the BKL which means that | ||
| 736 | * preemption is disabled. Various trace selftests however | ||
| 737 | * need to disable and enable preemption for successful tests. | ||
| 738 | * So we drop the BKL here and grab it after the tests again. | ||
| 739 | */ | ||
| 740 | unlock_kernel(); | ||
| 741 | mutex_lock(&trace_types_lock); | 744 | mutex_lock(&trace_types_lock); |
| 742 | 745 | ||
| 743 | tracing_selftest_running = true; | 746 | tracing_selftest_running = true; |
| @@ -819,7 +822,6 @@ __acquires(kernel_lock) | |||
| 819 | #endif | 822 | #endif |
| 820 | 823 | ||
| 821 | out_unlock: | 824 | out_unlock: |
| 822 | lock_kernel(); | ||
| 823 | return ret; | 825 | return ret; |
| 824 | } | 826 | } |
| 825 | 827 | ||
| @@ -1328,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) | |||
| 1328 | 1330 | ||
| 1329 | #endif /* CONFIG_STACKTRACE */ | 1331 | #endif /* CONFIG_STACKTRACE */ |
| 1330 | 1332 | ||
| 1331 | static void | ||
| 1332 | ftrace_trace_special(void *__tr, | ||
| 1333 | unsigned long arg1, unsigned long arg2, unsigned long arg3, | ||
| 1334 | int pc) | ||
| 1335 | { | ||
| 1336 | struct ftrace_event_call *call = &event_special; | ||
| 1337 | struct ring_buffer_event *event; | ||
| 1338 | struct trace_array *tr = __tr; | ||
| 1339 | struct ring_buffer *buffer = tr->buffer; | ||
| 1340 | struct special_entry *entry; | ||
| 1341 | |||
| 1342 | event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, | ||
| 1343 | sizeof(*entry), 0, pc); | ||
| 1344 | if (!event) | ||
| 1345 | return; | ||
| 1346 | entry = ring_buffer_event_data(event); | ||
| 1347 | entry->arg1 = arg1; | ||
| 1348 | entry->arg2 = arg2; | ||
| 1349 | entry->arg3 = arg3; | ||
| 1350 | |||
| 1351 | if (!filter_check_discard(call, entry, buffer, event)) | ||
| 1352 | trace_buffer_unlock_commit(buffer, event, 0, pc); | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | void | ||
| 1356 | __trace_special(void *__tr, void *__data, | ||
| 1357 | unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
| 1358 | { | ||
| 1359 | ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); | ||
| 1360 | } | ||
| 1361 | |||
| 1362 | void | ||
| 1363 | ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) | ||
| 1364 | { | ||
| 1365 | struct trace_array *tr = &global_trace; | ||
| 1366 | struct trace_array_cpu *data; | ||
| 1367 | unsigned long flags; | ||
| 1368 | int cpu; | ||
| 1369 | int pc; | ||
| 1370 | |||
| 1371 | if (tracing_disabled) | ||
| 1372 | return; | ||
| 1373 | |||
| 1374 | pc = preempt_count(); | ||
| 1375 | local_irq_save(flags); | ||
| 1376 | cpu = raw_smp_processor_id(); | ||
| 1377 | data = tr->data[cpu]; | ||
| 1378 | |||
| 1379 | if (likely(atomic_inc_return(&data->disabled) == 1)) | ||
| 1380 | ftrace_trace_special(tr, arg1, arg2, arg3, pc); | ||
| 1381 | |||
| 1382 | atomic_dec(&data->disabled); | ||
| 1383 | local_irq_restore(flags); | ||
| 1384 | } | ||
| 1385 | |||
| 1386 | /** | 1333 | /** |
| 1387 | * trace_vbprintk - write binary msg to tracing buffer | 1334 | * trace_vbprintk - write binary msg to tracing buffer |
| 1388 | * | 1335 | * |
| @@ -1401,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
| 1401 | struct bprint_entry *entry; | 1348 | struct bprint_entry *entry; |
| 1402 | unsigned long flags; | 1349 | unsigned long flags; |
| 1403 | int disable; | 1350 | int disable; |
| 1404 | int resched; | ||
| 1405 | int cpu, len = 0, size, pc; | 1351 | int cpu, len = 0, size, pc; |
| 1406 | 1352 | ||
| 1407 | if (unlikely(tracing_selftest_running || tracing_disabled)) | 1353 | if (unlikely(tracing_selftest_running || tracing_disabled)) |
| @@ -1411,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | |||
| 1411 | pause_graph_tracing(); | 1357 | pause_graph_tracing(); |
| 1412 | 1358 | ||
| 1413 | pc = preempt_count(); | 1359 | pc = preempt_count(); |
| 1414 | resched = ftrace_preempt_disable(); | 1360 | preempt_disable_notrace(); |
| 1415 | cpu = raw_smp_processor_id(); | 1361 | cpu = raw_smp_processor_id(); |
| 1416 | data = tr->data[cpu]; | 1362 | data = tr->data[cpu]; |
| 1417 | 1363 | ||
| @@ -1449,7 +1395,7 @@ out_unlock: | |||
| 1449 | 1395 | ||
| 1450 | out: | 1396 | out: |
| 1451 | atomic_dec_return(&data->disabled); | 1397 | atomic_dec_return(&data->disabled); |
| 1452 | ftrace_preempt_enable(resched); | 1398 | preempt_enable_notrace(); |
| 1453 | unpause_graph_tracing(); | 1399 | unpause_graph_tracing(); |
| 1454 | 1400 | ||
| 1455 | return len; | 1401 | return len; |
| @@ -2386,6 +2332,7 @@ static const struct file_operations show_traces_fops = { | |||
| 2386 | .open = show_traces_open, | 2332 | .open = show_traces_open, |
| 2387 | .read = seq_read, | 2333 | .read = seq_read, |
| 2388 | .release = seq_release, | 2334 | .release = seq_release, |
| 2335 | .llseek = seq_lseek, | ||
| 2389 | }; | 2336 | }; |
| 2390 | 2337 | ||
| 2391 | /* | 2338 | /* |
| @@ -2479,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { | |||
| 2479 | .open = tracing_open_generic, | 2426 | .open = tracing_open_generic, |
| 2480 | .read = tracing_cpumask_read, | 2427 | .read = tracing_cpumask_read, |
| 2481 | .write = tracing_cpumask_write, | 2428 | .write = tracing_cpumask_write, |
| 2429 | .llseek = generic_file_llseek, | ||
| 2482 | }; | 2430 | }; |
| 2483 | 2431 | ||
| 2484 | static int tracing_trace_options_show(struct seq_file *m, void *v) | 2432 | static int tracing_trace_options_show(struct seq_file *m, void *v) |
| @@ -2554,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) | |||
| 2554 | trace_flags |= mask; | 2502 | trace_flags |= mask; |
| 2555 | else | 2503 | else |
| 2556 | trace_flags &= ~mask; | 2504 | trace_flags &= ~mask; |
| 2505 | |||
| 2506 | if (mask == TRACE_ITER_RECORD_CMD) | ||
| 2507 | trace_event_enable_cmd_record(enabled); | ||
| 2557 | } | 2508 | } |
| 2558 | 2509 | ||
| 2559 | static ssize_t | 2510 | static ssize_t |
| @@ -2645,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, | |||
| 2645 | static const struct file_operations tracing_readme_fops = { | 2596 | static const struct file_operations tracing_readme_fops = { |
| 2646 | .open = tracing_open_generic, | 2597 | .open = tracing_open_generic, |
| 2647 | .read = tracing_readme_read, | 2598 | .read = tracing_readme_read, |
| 2599 | .llseek = generic_file_llseek, | ||
| 2648 | }; | 2600 | }; |
| 2649 | 2601 | ||
| 2650 | static ssize_t | 2602 | static ssize_t |
| @@ -2695,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, | |||
| 2695 | static const struct file_operations tracing_saved_cmdlines_fops = { | 2647 | static const struct file_operations tracing_saved_cmdlines_fops = { |
| 2696 | .open = tracing_open_generic, | 2648 | .open = tracing_open_generic, |
| 2697 | .read = tracing_saved_cmdlines_read, | 2649 | .read = tracing_saved_cmdlines_read, |
| 2650 | .llseek = generic_file_llseek, | ||
| 2698 | }; | 2651 | }; |
| 2699 | 2652 | ||
| 2700 | static ssize_t | 2653 | static ssize_t |
| @@ -2790,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
| 2790 | if (ret < 0) | 2743 | if (ret < 0) |
| 2791 | return ret; | 2744 | return ret; |
| 2792 | 2745 | ||
| 2746 | if (!current_trace->use_max_tr) | ||
| 2747 | goto out; | ||
| 2748 | |||
| 2793 | ret = ring_buffer_resize(max_tr.buffer, size); | 2749 | ret = ring_buffer_resize(max_tr.buffer, size); |
| 2794 | if (ret < 0) { | 2750 | if (ret < 0) { |
| 2795 | int r; | 2751 | int r; |
| @@ -2817,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
| 2817 | return ret; | 2773 | return ret; |
| 2818 | } | 2774 | } |
| 2819 | 2775 | ||
| 2776 | max_tr.entries = size; | ||
| 2777 | out: | ||
| 2820 | global_trace.entries = size; | 2778 | global_trace.entries = size; |
| 2821 | 2779 | ||
| 2822 | return ret; | 2780 | return ret; |
| 2823 | } | 2781 | } |
| 2824 | 2782 | ||
| 2783 | |||
| 2825 | /** | 2784 | /** |
| 2826 | * tracing_update_buffers - used by tracing facility to expand ring buffers | 2785 | * tracing_update_buffers - used by tracing facility to expand ring buffers |
| 2827 | * | 2786 | * |
| @@ -2882,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) | |||
| 2882 | trace_branch_disable(); | 2841 | trace_branch_disable(); |
| 2883 | if (current_trace && current_trace->reset) | 2842 | if (current_trace && current_trace->reset) |
| 2884 | current_trace->reset(tr); | 2843 | current_trace->reset(tr); |
| 2885 | 2844 | if (current_trace && current_trace->use_max_tr) { | |
| 2845 | /* | ||
| 2846 | * We don't free the ring buffer. instead, resize it because | ||
| 2847 | * The max_tr ring buffer has some state (e.g. ring->clock) and | ||
| 2848 | * we want preserve it. | ||
| 2849 | */ | ||
| 2850 | ring_buffer_resize(max_tr.buffer, 1); | ||
| 2851 | max_tr.entries = 1; | ||
| 2852 | } | ||
| 2886 | destroy_trace_option_files(topts); | 2853 | destroy_trace_option_files(topts); |
| 2887 | 2854 | ||
| 2888 | current_trace = t; | 2855 | current_trace = t; |
| 2889 | 2856 | ||
| 2890 | topts = create_trace_option_files(current_trace); | 2857 | topts = create_trace_option_files(current_trace); |
| 2858 | if (current_trace->use_max_tr) { | ||
| 2859 | ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); | ||
| 2860 | if (ret < 0) | ||
| 2861 | goto out; | ||
| 2862 | max_tr.entries = global_trace.entries; | ||
| 2863 | } | ||
| 2891 | 2864 | ||
| 2892 | if (t->init) { | 2865 | if (t->init) { |
| 2893 | ret = tracer_init(t, tr); | 2866 | ret = tracer_init(t, tr); |
| @@ -3024,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
| 3024 | if (iter->trace->pipe_open) | 2997 | if (iter->trace->pipe_open) |
| 3025 | iter->trace->pipe_open(iter); | 2998 | iter->trace->pipe_open(iter); |
| 3026 | 2999 | ||
| 3000 | nonseekable_open(inode, filp); | ||
| 3027 | out: | 3001 | out: |
| 3028 | mutex_unlock(&trace_types_lock); | 3002 | mutex_unlock(&trace_types_lock); |
| 3029 | return ret; | 3003 | return ret; |
| @@ -3469,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
| 3469 | } | 3443 | } |
| 3470 | 3444 | ||
| 3471 | tracing_start(); | 3445 | tracing_start(); |
| 3472 | max_tr.entries = global_trace.entries; | ||
| 3473 | mutex_unlock(&trace_types_lock); | 3446 | mutex_unlock(&trace_types_lock); |
| 3474 | 3447 | ||
| 3475 | return cnt; | 3448 | return cnt; |
| @@ -3582,18 +3555,21 @@ static const struct file_operations tracing_max_lat_fops = { | |||
| 3582 | .open = tracing_open_generic, | 3555 | .open = tracing_open_generic, |
| 3583 | .read = tracing_max_lat_read, | 3556 | .read = tracing_max_lat_read, |
| 3584 | .write = tracing_max_lat_write, | 3557 | .write = tracing_max_lat_write, |
| 3558 | .llseek = generic_file_llseek, | ||
| 3585 | }; | 3559 | }; |
| 3586 | 3560 | ||
| 3587 | static const struct file_operations tracing_ctrl_fops = { | 3561 | static const struct file_operations tracing_ctrl_fops = { |
| 3588 | .open = tracing_open_generic, | 3562 | .open = tracing_open_generic, |
| 3589 | .read = tracing_ctrl_read, | 3563 | .read = tracing_ctrl_read, |
| 3590 | .write = tracing_ctrl_write, | 3564 | .write = tracing_ctrl_write, |
| 3565 | .llseek = generic_file_llseek, | ||
| 3591 | }; | 3566 | }; |
| 3592 | 3567 | ||
| 3593 | static const struct file_operations set_tracer_fops = { | 3568 | static const struct file_operations set_tracer_fops = { |
| 3594 | .open = tracing_open_generic, | 3569 | .open = tracing_open_generic, |
| 3595 | .read = tracing_set_trace_read, | 3570 | .read = tracing_set_trace_read, |
| 3596 | .write = tracing_set_trace_write, | 3571 | .write = tracing_set_trace_write, |
| 3572 | .llseek = generic_file_llseek, | ||
| 3597 | }; | 3573 | }; |
| 3598 | 3574 | ||
| 3599 | static const struct file_operations tracing_pipe_fops = { | 3575 | static const struct file_operations tracing_pipe_fops = { |
| @@ -3602,17 +3578,20 @@ static const struct file_operations tracing_pipe_fops = { | |||
| 3602 | .read = tracing_read_pipe, | 3578 | .read = tracing_read_pipe, |
| 3603 | .splice_read = tracing_splice_read_pipe, | 3579 | .splice_read = tracing_splice_read_pipe, |
| 3604 | .release = tracing_release_pipe, | 3580 | .release = tracing_release_pipe, |
| 3581 | .llseek = no_llseek, | ||
| 3605 | }; | 3582 | }; |
| 3606 | 3583 | ||
| 3607 | static const struct file_operations tracing_entries_fops = { | 3584 | static const struct file_operations tracing_entries_fops = { |
| 3608 | .open = tracing_open_generic, | 3585 | .open = tracing_open_generic, |
| 3609 | .read = tracing_entries_read, | 3586 | .read = tracing_entries_read, |
| 3610 | .write = tracing_entries_write, | 3587 | .write = tracing_entries_write, |
| 3588 | .llseek = generic_file_llseek, | ||
| 3611 | }; | 3589 | }; |
| 3612 | 3590 | ||
| 3613 | static const struct file_operations tracing_mark_fops = { | 3591 | static const struct file_operations tracing_mark_fops = { |
| 3614 | .open = tracing_open_generic, | 3592 | .open = tracing_open_generic, |
| 3615 | .write = tracing_mark_write, | 3593 | .write = tracing_mark_write, |
| 3594 | .llseek = generic_file_llseek, | ||
| 3616 | }; | 3595 | }; |
| 3617 | 3596 | ||
| 3618 | static const struct file_operations trace_clock_fops = { | 3597 | static const struct file_operations trace_clock_fops = { |
| @@ -3918,6 +3897,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
| 3918 | static const struct file_operations tracing_stats_fops = { | 3897 | static const struct file_operations tracing_stats_fops = { |
| 3919 | .open = tracing_open_generic, | 3898 | .open = tracing_open_generic, |
| 3920 | .read = tracing_stats_read, | 3899 | .read = tracing_stats_read, |
| 3900 | .llseek = generic_file_llseek, | ||
| 3921 | }; | 3901 | }; |
| 3922 | 3902 | ||
| 3923 | #ifdef CONFIG_DYNAMIC_FTRACE | 3903 | #ifdef CONFIG_DYNAMIC_FTRACE |
| @@ -3954,6 +3934,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, | |||
| 3954 | static const struct file_operations tracing_dyn_info_fops = { | 3934 | static const struct file_operations tracing_dyn_info_fops = { |
| 3955 | .open = tracing_open_generic, | 3935 | .open = tracing_open_generic, |
| 3956 | .read = tracing_read_dyn_info, | 3936 | .read = tracing_read_dyn_info, |
| 3937 | .llseek = generic_file_llseek, | ||
| 3957 | }; | 3938 | }; |
| 3958 | #endif | 3939 | #endif |
| 3959 | 3940 | ||
| @@ -4107,6 +4088,7 @@ static const struct file_operations trace_options_fops = { | |||
| 4107 | .open = tracing_open_generic, | 4088 | .open = tracing_open_generic, |
| 4108 | .read = trace_options_read, | 4089 | .read = trace_options_read, |
| 4109 | .write = trace_options_write, | 4090 | .write = trace_options_write, |
| 4091 | .llseek = generic_file_llseek, | ||
| 4110 | }; | 4092 | }; |
| 4111 | 4093 | ||
| 4112 | static ssize_t | 4094 | static ssize_t |
| @@ -4158,6 +4140,7 @@ static const struct file_operations trace_options_core_fops = { | |||
| 4158 | .open = tracing_open_generic, | 4140 | .open = tracing_open_generic, |
| 4159 | .read = trace_options_core_read, | 4141 | .read = trace_options_core_read, |
| 4160 | .write = trace_options_core_write, | 4142 | .write = trace_options_core_write, |
| 4143 | .llseek = generic_file_llseek, | ||
| 4161 | }; | 4144 | }; |
| 4162 | 4145 | ||
| 4163 | struct dentry *trace_create_file(const char *name, | 4146 | struct dentry *trace_create_file(const char *name, |
| @@ -4347,9 +4330,6 @@ static __init int tracer_init_debugfs(void) | |||
| 4347 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 4330 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
| 4348 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 4331 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
| 4349 | #endif | 4332 | #endif |
| 4350 | #ifdef CONFIG_SYSPROF_TRACER | ||
| 4351 | init_tracer_sysprof_debugfs(d_tracer); | ||
| 4352 | #endif | ||
| 4353 | 4333 | ||
| 4354 | create_trace_options_dir(); | 4334 | create_trace_options_dir(); |
| 4355 | 4335 | ||
| @@ -4576,16 +4556,14 @@ __init static int tracer_alloc_buffers(void) | |||
| 4576 | 4556 | ||
| 4577 | 4557 | ||
| 4578 | #ifdef CONFIG_TRACER_MAX_TRACE | 4558 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 4579 | max_tr.buffer = ring_buffer_alloc(ring_buf_size, | 4559 | max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); |
| 4580 | TRACE_BUFFER_FLAGS); | ||
| 4581 | if (!max_tr.buffer) { | 4560 | if (!max_tr.buffer) { |
| 4582 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); | 4561 | printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); |
| 4583 | WARN_ON(1); | 4562 | WARN_ON(1); |
| 4584 | ring_buffer_free(global_trace.buffer); | 4563 | ring_buffer_free(global_trace.buffer); |
| 4585 | goto out_free_cpumask; | 4564 | goto out_free_cpumask; |
| 4586 | } | 4565 | } |
| 4587 | max_tr.entries = ring_buffer_size(max_tr.buffer); | 4566 | max_tr.entries = 1; |
| 4588 | WARN_ON(max_tr.entries != global_trace.entries); | ||
| 4589 | #endif | 4567 | #endif |
| 4590 | 4568 | ||
| 4591 | /* Allocate the first page for all buffers */ | 4569 | /* Allocate the first page for all buffers */ |
| @@ -4598,9 +4576,6 @@ __init static int tracer_alloc_buffers(void) | |||
| 4598 | 4576 | ||
| 4599 | register_tracer(&nop_trace); | 4577 | register_tracer(&nop_trace); |
| 4600 | current_trace = &nop_trace; | 4578 | current_trace = &nop_trace; |
| 4601 | #ifdef CONFIG_BOOT_TRACER | ||
| 4602 | register_tracer(&boot_tracer); | ||
| 4603 | #endif | ||
| 4604 | /* All seems OK, enable tracing */ | 4579 | /* All seems OK, enable tracing */ |
| 4605 | tracing_disabled = 0; | 4580 | tracing_disabled = 0; |
| 4606 | 4581 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0605fc00c176..d39b3c5454a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -9,10 +9,7 @@ | |||
| 9 | #include <linux/mmiotrace.h> | 9 | #include <linux/mmiotrace.h> |
| 10 | #include <linux/tracepoint.h> | 10 | #include <linux/tracepoint.h> |
| 11 | #include <linux/ftrace.h> | 11 | #include <linux/ftrace.h> |
| 12 | #include <trace/boot.h> | ||
| 13 | #include <linux/kmemtrace.h> | ||
| 14 | #include <linux/hw_breakpoint.h> | 12 | #include <linux/hw_breakpoint.h> |
| 15 | |||
| 16 | #include <linux/trace_seq.h> | 13 | #include <linux/trace_seq.h> |
| 17 | #include <linux/ftrace_event.h> | 14 | #include <linux/ftrace_event.h> |
| 18 | 15 | ||
| @@ -25,30 +22,17 @@ enum trace_type { | |||
| 25 | TRACE_STACK, | 22 | TRACE_STACK, |
| 26 | TRACE_PRINT, | 23 | TRACE_PRINT, |
| 27 | TRACE_BPRINT, | 24 | TRACE_BPRINT, |
| 28 | TRACE_SPECIAL, | ||
| 29 | TRACE_MMIO_RW, | 25 | TRACE_MMIO_RW, |
| 30 | TRACE_MMIO_MAP, | 26 | TRACE_MMIO_MAP, |
| 31 | TRACE_BRANCH, | 27 | TRACE_BRANCH, |
| 32 | TRACE_BOOT_CALL, | ||
| 33 | TRACE_BOOT_RET, | ||
| 34 | TRACE_GRAPH_RET, | 28 | TRACE_GRAPH_RET, |
| 35 | TRACE_GRAPH_ENT, | 29 | TRACE_GRAPH_ENT, |
| 36 | TRACE_USER_STACK, | 30 | TRACE_USER_STACK, |
| 37 | TRACE_KMEM_ALLOC, | ||
| 38 | TRACE_KMEM_FREE, | ||
| 39 | TRACE_BLK, | 31 | TRACE_BLK, |
| 40 | TRACE_KSYM, | ||
| 41 | 32 | ||
| 42 | __TRACE_LAST_TYPE, | 33 | __TRACE_LAST_TYPE, |
| 43 | }; | 34 | }; |
| 44 | 35 | ||
| 45 | enum kmemtrace_type_id { | ||
| 46 | KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ | ||
| 47 | KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ | ||
| 48 | KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ | ||
| 49 | }; | ||
| 50 | |||
| 51 | extern struct tracer boot_tracer; | ||
| 52 | 36 | ||
| 53 | #undef __field | 37 | #undef __field |
| 54 | #define __field(type, item) type item; | 38 | #define __field(type, item) type item; |
| @@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void); | |||
| 204 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ | 188 | IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ |
| 205 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ | 189 | IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ |
| 206 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ | 190 | IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ |
| 207 | IF_ASSIGN(var, ent, struct special_entry, 0); \ | ||
| 208 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ | 191 | IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ |
| 209 | TRACE_MMIO_RW); \ | 192 | TRACE_MMIO_RW); \ |
| 210 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ | 193 | IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ |
| 211 | TRACE_MMIO_MAP); \ | 194 | TRACE_MMIO_MAP); \ |
| 212 | IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ | ||
| 213 | IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ | ||
| 214 | IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ | 195 | IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ |
| 215 | IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ | 196 | IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ |
| 216 | TRACE_GRAPH_ENT); \ | 197 | TRACE_GRAPH_ENT); \ |
| 217 | IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ | 198 | IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ |
| 218 | TRACE_GRAPH_RET); \ | 199 | TRACE_GRAPH_RET); \ |
| 219 | IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ | ||
| 220 | TRACE_KMEM_ALLOC); \ | ||
| 221 | IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ | ||
| 222 | TRACE_KMEM_FREE); \ | ||
| 223 | IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ | ||
| 224 | __ftrace_bad_type(); \ | 200 | __ftrace_bad_type(); \ |
| 225 | } while (0) | 201 | } while (0) |
| 226 | 202 | ||
| @@ -298,6 +274,7 @@ struct tracer { | |||
| 298 | struct tracer *next; | 274 | struct tracer *next; |
| 299 | int print_max; | 275 | int print_max; |
| 300 | struct tracer_flags *flags; | 276 | struct tracer_flags *flags; |
| 277 | int use_max_tr; | ||
| 301 | }; | 278 | }; |
| 302 | 279 | ||
| 303 | 280 | ||
| @@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name, | |||
| 318 | const struct file_operations *fops); | 295 | const struct file_operations *fops); |
| 319 | 296 | ||
| 320 | struct dentry *tracing_init_dentry(void); | 297 | struct dentry *tracing_init_dentry(void); |
| 321 | void init_tracer_sysprof_debugfs(struct dentry *d_tracer); | ||
| 322 | 298 | ||
| 323 | struct ring_buffer_event; | 299 | struct ring_buffer_event; |
| 324 | 300 | ||
| @@ -363,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, | |||
| 363 | struct task_struct *wakee, | 339 | struct task_struct *wakee, |
| 364 | struct task_struct *cur, | 340 | struct task_struct *cur, |
| 365 | unsigned long flags, int pc); | 341 | unsigned long flags, int pc); |
| 366 | void trace_special(struct trace_array *tr, | ||
| 367 | struct trace_array_cpu *data, | ||
| 368 | unsigned long arg1, | ||
| 369 | unsigned long arg2, | ||
| 370 | unsigned long arg3, int pc); | ||
| 371 | void trace_function(struct trace_array *tr, | 342 | void trace_function(struct trace_array *tr, |
| 372 | unsigned long ip, | 343 | unsigned long ip, |
| 373 | unsigned long parent_ip, | 344 | unsigned long parent_ip, |
| @@ -398,8 +369,6 @@ extern cpumask_var_t __read_mostly tracing_buffer_mask; | |||
| 398 | #define for_each_tracing_cpu(cpu) \ | 369 | #define for_each_tracing_cpu(cpu) \ |
| 399 | for_each_cpu(cpu, tracing_buffer_mask) | 370 | for_each_cpu(cpu, tracing_buffer_mask) |
| 400 | 371 | ||
| 401 | extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); | ||
| 402 | |||
| 403 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); | 372 | extern unsigned long nsecs_to_usecs(unsigned long nsecs); |
| 404 | 373 | ||
| 405 | extern unsigned long tracing_thresh; | 374 | extern unsigned long tracing_thresh; |
| @@ -469,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, | |||
| 469 | struct trace_array *tr); | 438 | struct trace_array *tr); |
| 470 | extern int trace_selftest_startup_sched_switch(struct tracer *trace, | 439 | extern int trace_selftest_startup_sched_switch(struct tracer *trace, |
| 471 | struct trace_array *tr); | 440 | struct trace_array *tr); |
| 472 | extern int trace_selftest_startup_sysprof(struct tracer *trace, | ||
| 473 | struct trace_array *tr); | ||
| 474 | extern int trace_selftest_startup_branch(struct tracer *trace, | 441 | extern int trace_selftest_startup_branch(struct tracer *trace, |
| 475 | struct trace_array *tr); | 442 | struct trace_array *tr); |
| 476 | extern int trace_selftest_startup_ksym(struct tracer *trace, | ||
| 477 | struct trace_array *tr); | ||
| 478 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | 443 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ |
| 479 | 444 | ||
| 480 | extern void *head_page(struct trace_array_cpu *data); | 445 | extern void *head_page(struct trace_array_cpu *data); |
| @@ -636,6 +601,7 @@ enum trace_iterator_flags { | |||
| 636 | TRACE_ITER_LATENCY_FMT = 0x20000, | 601 | TRACE_ITER_LATENCY_FMT = 0x20000, |
| 637 | TRACE_ITER_SLEEP_TIME = 0x40000, | 602 | TRACE_ITER_SLEEP_TIME = 0x40000, |
| 638 | TRACE_ITER_GRAPH_TIME = 0x80000, | 603 | TRACE_ITER_GRAPH_TIME = 0x80000, |
| 604 | TRACE_ITER_RECORD_CMD = 0x100000, | ||
| 639 | }; | 605 | }; |
| 640 | 606 | ||
| 641 | /* | 607 | /* |
| @@ -647,54 +613,6 @@ enum trace_iterator_flags { | |||
| 647 | 613 | ||
| 648 | extern struct tracer nop_trace; | 614 | extern struct tracer nop_trace; |
| 649 | 615 | ||
| 650 | /** | ||
| 651 | * ftrace_preempt_disable - disable preemption scheduler safe | ||
| 652 | * | ||
| 653 | * When tracing can happen inside the scheduler, there exists | ||
| 654 | * cases that the tracing might happen before the need_resched | ||
| 655 | * flag is checked. If this happens and the tracer calls | ||
| 656 | * preempt_enable (after a disable), a schedule might take place | ||
| 657 | * causing an infinite recursion. | ||
| 658 | * | ||
| 659 | * To prevent this, we read the need_resched flag before | ||
| 660 | * disabling preemption. When we want to enable preemption we | ||
| 661 | * check the flag, if it is set, then we call preempt_enable_no_resched. | ||
| 662 | * Otherwise, we call preempt_enable. | ||
| 663 | * | ||
| 664 | * The rational for doing the above is that if need_resched is set | ||
| 665 | * and we have yet to reschedule, we are either in an atomic location | ||
| 666 | * (where we do not need to check for scheduling) or we are inside | ||
| 667 | * the scheduler and do not want to resched. | ||
| 668 | */ | ||
| 669 | static inline int ftrace_preempt_disable(void) | ||
| 670 | { | ||
| 671 | int resched; | ||
| 672 | |||
| 673 | resched = need_resched(); | ||
| 674 | preempt_disable_notrace(); | ||
| 675 | |||
| 676 | return resched; | ||
| 677 | } | ||
| 678 | |||
| 679 | /** | ||
| 680 | * ftrace_preempt_enable - enable preemption scheduler safe | ||
| 681 | * @resched: the return value from ftrace_preempt_disable | ||
| 682 | * | ||
| 683 | * This is a scheduler safe way to enable preemption and not miss | ||
| 684 | * any preemption checks. The disabled saved the state of preemption. | ||
| 685 | * If resched is set, then we are either inside an atomic or | ||
| 686 | * are inside the scheduler (we would have already scheduled | ||
| 687 | * otherwise). In this case, we do not want to call normal | ||
| 688 | * preempt_enable, but preempt_enable_no_resched instead. | ||
| 689 | */ | ||
| 690 | static inline void ftrace_preempt_enable(int resched) | ||
| 691 | { | ||
| 692 | if (resched) | ||
| 693 | preempt_enable_no_resched_notrace(); | ||
| 694 | else | ||
| 695 | preempt_enable_notrace(); | ||
| 696 | } | ||
| 697 | |||
| 698 | #ifdef CONFIG_BRANCH_TRACER | 616 | #ifdef CONFIG_BRANCH_TRACER |
| 699 | extern int enable_branch_tracing(struct trace_array *tr); | 617 | extern int enable_branch_tracing(struct trace_array *tr); |
| 700 | extern void disable_branch_tracing(void); | 618 | extern void disable_branch_tracing(void); |
| @@ -785,6 +703,8 @@ struct filter_pred { | |||
| 785 | int pop_n; | 703 | int pop_n; |
| 786 | }; | 704 | }; |
| 787 | 705 | ||
| 706 | extern struct list_head ftrace_common_fields; | ||
| 707 | |||
| 788 | extern enum regex_type | 708 | extern enum regex_type |
| 789 | filter_parse_regex(char *buff, int len, char **search, int *not); | 709 | filter_parse_regex(char *buff, int len, char **search, int *not); |
| 790 | extern void print_event_filter(struct ftrace_event_call *call, | 710 | extern void print_event_filter(struct ftrace_event_call *call, |
| @@ -814,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, | |||
| 814 | return 0; | 734 | return 0; |
| 815 | } | 735 | } |
| 816 | 736 | ||
| 737 | extern void trace_event_enable_cmd_record(bool enable); | ||
| 738 | |||
| 817 | extern struct mutex event_mutex; | 739 | extern struct mutex event_mutex; |
| 818 | extern struct list_head ftrace_events; | 740 | extern struct list_head ftrace_events; |
| 819 | 741 | ||
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956ad..000000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null | |||
| @@ -1,185 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * ring buffer based initcalls tracer | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> | ||
| 5 | * | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/init.h> | ||
| 9 | #include <linux/debugfs.h> | ||
| 10 | #include <linux/ftrace.h> | ||
| 11 | #include <linux/kallsyms.h> | ||
| 12 | #include <linux/time.h> | ||
| 13 | |||
| 14 | #include "trace.h" | ||
| 15 | #include "trace_output.h" | ||
| 16 | |||
| 17 | static struct trace_array *boot_trace; | ||
| 18 | static bool pre_initcalls_finished; | ||
| 19 | |||
| 20 | /* Tells the boot tracer that the pre_smp_initcalls are finished. | ||
| 21 | * So we are ready . | ||
| 22 | * It doesn't enable sched events tracing however. | ||
| 23 | * You have to call enable_boot_trace to do so. | ||
| 24 | */ | ||
| 25 | void start_boot_trace(void) | ||
| 26 | { | ||
| 27 | pre_initcalls_finished = true; | ||
| 28 | } | ||
| 29 | |||
| 30 | void enable_boot_trace(void) | ||
| 31 | { | ||
| 32 | if (boot_trace && pre_initcalls_finished) | ||
| 33 | tracing_start_sched_switch_record(); | ||
| 34 | } | ||
| 35 | |||
| 36 | void disable_boot_trace(void) | ||
| 37 | { | ||
| 38 | if (boot_trace && pre_initcalls_finished) | ||
| 39 | tracing_stop_sched_switch_record(); | ||
| 40 | } | ||
| 41 | |||
| 42 | static int boot_trace_init(struct trace_array *tr) | ||
| 43 | { | ||
| 44 | boot_trace = tr; | ||
| 45 | |||
| 46 | if (!tr) | ||
| 47 | return 0; | ||
| 48 | |||
| 49 | tracing_reset_online_cpus(tr); | ||
| 50 | |||
| 51 | tracing_sched_switch_assign_trace(tr); | ||
| 52 | return 0; | ||
| 53 | } | ||
| 54 | |||
| 55 | static enum print_line_t | ||
| 56 | initcall_call_print_line(struct trace_iterator *iter) | ||
| 57 | { | ||
| 58 | struct trace_entry *entry = iter->ent; | ||
| 59 | struct trace_seq *s = &iter->seq; | ||
| 60 | struct trace_boot_call *field; | ||
| 61 | struct boot_trace_call *call; | ||
| 62 | u64 ts; | ||
| 63 | unsigned long nsec_rem; | ||
| 64 | int ret; | ||
| 65 | |||
| 66 | trace_assign_type(field, entry); | ||
| 67 | call = &field->boot_call; | ||
| 68 | ts = iter->ts; | ||
| 69 | nsec_rem = do_div(ts, NSEC_PER_SEC); | ||
| 70 | |||
| 71 | ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", | ||
| 72 | (unsigned long)ts, nsec_rem, call->func, call->caller); | ||
| 73 | |||
| 74 | if (!ret) | ||
| 75 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 76 | else | ||
| 77 | return TRACE_TYPE_HANDLED; | ||
| 78 | } | ||
| 79 | |||
| 80 | static enum print_line_t | ||
| 81 | initcall_ret_print_line(struct trace_iterator *iter) | ||
| 82 | { | ||
| 83 | struct trace_entry *entry = iter->ent; | ||
| 84 | struct trace_seq *s = &iter->seq; | ||
| 85 | struct trace_boot_ret *field; | ||
| 86 | struct boot_trace_ret *init_ret; | ||
| 87 | u64 ts; | ||
| 88 | unsigned long nsec_rem; | ||
| 89 | int ret; | ||
| 90 | |||
| 91 | trace_assign_type(field, entry); | ||
| 92 | init_ret = &field->boot_ret; | ||
| 93 | ts = iter->ts; | ||
| 94 | nsec_rem = do_div(ts, NSEC_PER_SEC); | ||
| 95 | |||
| 96 | ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " | ||
| 97 | "returned %d after %llu msecs\n", | ||
| 98 | (unsigned long) ts, | ||
| 99 | nsec_rem, | ||
| 100 | init_ret->func, init_ret->result, init_ret->duration); | ||
| 101 | |||
| 102 | if (!ret) | ||
| 103 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 104 | else | ||
| 105 | return TRACE_TYPE_HANDLED; | ||
| 106 | } | ||
| 107 | |||
| 108 | static enum print_line_t initcall_print_line(struct trace_iterator *iter) | ||
| 109 | { | ||
| 110 | struct trace_entry *entry = iter->ent; | ||
| 111 | |||
| 112 | switch (entry->type) { | ||
| 113 | case TRACE_BOOT_CALL: | ||
| 114 | return initcall_call_print_line(iter); | ||
| 115 | case TRACE_BOOT_RET: | ||
| 116 | return initcall_ret_print_line(iter); | ||
| 117 | default: | ||
| 118 | return TRACE_TYPE_UNHANDLED; | ||
| 119 | } | ||
| 120 | } | ||
| 121 | |||
| 122 | struct tracer boot_tracer __read_mostly = | ||
| 123 | { | ||
| 124 | .name = "initcall", | ||
| 125 | .init = boot_trace_init, | ||
| 126 | .reset = tracing_reset_online_cpus, | ||
| 127 | .print_line = initcall_print_line, | ||
| 128 | }; | ||
| 129 | |||
| 130 | void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) | ||
| 131 | { | ||
| 132 | struct ftrace_event_call *call = &event_boot_call; | ||
| 133 | struct ring_buffer_event *event; | ||
| 134 | struct ring_buffer *buffer; | ||
| 135 | struct trace_boot_call *entry; | ||
| 136 | struct trace_array *tr = boot_trace; | ||
| 137 | |||
| 138 | if (!tr || !pre_initcalls_finished) | ||
| 139 | return; | ||
| 140 | |||
| 141 | /* Get its name now since this function could | ||
| 142 | * disappear because it is in the .init section. | ||
| 143 | */ | ||
| 144 | sprint_symbol(bt->func, (unsigned long)fn); | ||
| 145 | preempt_disable(); | ||
| 146 | |||
| 147 | buffer = tr->buffer; | ||
| 148 | event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, | ||
| 149 | sizeof(*entry), 0, 0); | ||
| 150 | if (!event) | ||
| 151 | goto out; | ||
| 152 | entry = ring_buffer_event_data(event); | ||
| 153 | entry->boot_call = *bt; | ||
| 154 | if (!filter_check_discard(call, entry, buffer, event)) | ||
| 155 | trace_buffer_unlock_commit(buffer, event, 0, 0); | ||
| 156 | out: | ||
| 157 | preempt_enable(); | ||
| 158 | } | ||
| 159 | |||
| 160 | void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) | ||
| 161 | { | ||
| 162 | struct ftrace_event_call *call = &event_boot_ret; | ||
| 163 | struct ring_buffer_event *event; | ||
| 164 | struct ring_buffer *buffer; | ||
| 165 | struct trace_boot_ret *entry; | ||
| 166 | struct trace_array *tr = boot_trace; | ||
| 167 | |||
| 168 | if (!tr || !pre_initcalls_finished) | ||
| 169 | return; | ||
| 170 | |||
| 171 | sprint_symbol(bt->func, (unsigned long)fn); | ||
| 172 | preempt_disable(); | ||
| 173 | |||
| 174 | buffer = tr->buffer; | ||
| 175 | event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, | ||
| 176 | sizeof(*entry), 0, 0); | ||
| 177 | if (!event) | ||
| 178 | goto out; | ||
| 179 | entry = ring_buffer_event_data(event); | ||
| 180 | entry->boot_ret = *bt; | ||
| 181 | if (!filter_check_discard(call, entry, buffer, event)) | ||
| 182 | trace_buffer_unlock_commit(buffer, event, 0, 0); | ||
| 183 | out: | ||
| 184 | preempt_enable(); | ||
| 185 | } | ||
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1a..685a67d55db0 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
| @@ -32,16 +32,15 @@ | |||
| 32 | u64 notrace trace_clock_local(void) | 32 | u64 notrace trace_clock_local(void) |
| 33 | { | 33 | { |
| 34 | u64 clock; | 34 | u64 clock; |
| 35 | int resched; | ||
| 36 | 35 | ||
| 37 | /* | 36 | /* |
| 38 | * sched_clock() is an architecture implemented, fast, scalable, | 37 | * sched_clock() is an architecture implemented, fast, scalable, |
| 39 | * lockless clock. It is not guaranteed to be coherent across | 38 | * lockless clock. It is not guaranteed to be coherent across |
| 40 | * CPUs, nor across CPU idle events. | 39 | * CPUs, nor across CPU idle events. |
| 41 | */ | 40 | */ |
| 42 | resched = ftrace_preempt_disable(); | 41 | preempt_disable_notrace(); |
| 43 | clock = sched_clock(); | 42 | clock = sched_clock(); |
| 44 | ftrace_preempt_enable(resched); | 43 | preempt_enable_notrace(); |
| 45 | 44 | ||
| 46 | return clock; | 45 | return clock; |
| 47 | } | 46 | } |
| @@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void) | |||
| 56 | */ | 55 | */ |
| 57 | u64 notrace trace_clock(void) | 56 | u64 notrace trace_clock(void) |
| 58 | { | 57 | { |
| 59 | return cpu_clock(raw_smp_processor_id()); | 58 | return local_clock(); |
| 60 | } | 59 | } |
| 61 | 60 | ||
| 62 | 61 | ||
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240da..e3dfecaf13e6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, | |||
| 151 | ); | 151 | ); |
| 152 | 152 | ||
| 153 | /* | 153 | /* |
| 154 | * Special (free-form) trace entry: | ||
| 155 | */ | ||
| 156 | FTRACE_ENTRY(special, special_entry, | ||
| 157 | |||
| 158 | TRACE_SPECIAL, | ||
| 159 | |||
| 160 | F_STRUCT( | ||
| 161 | __field( unsigned long, arg1 ) | ||
| 162 | __field( unsigned long, arg2 ) | ||
| 163 | __field( unsigned long, arg3 ) | ||
| 164 | ), | ||
| 165 | |||
| 166 | F_printk("(%08lx) (%08lx) (%08lx)", | ||
| 167 | __entry->arg1, __entry->arg2, __entry->arg3) | ||
| 168 | ); | ||
| 169 | |||
| 170 | /* | ||
| 171 | * Stack-trace entry: | 154 | * Stack-trace entry: |
| 172 | */ | 155 | */ |
| 173 | 156 | ||
| @@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, | |||
| 271 | __entry->map_id, __entry->opcode) | 254 | __entry->map_id, __entry->opcode) |
| 272 | ); | 255 | ); |
| 273 | 256 | ||
| 274 | FTRACE_ENTRY(boot_call, trace_boot_call, | ||
| 275 | |||
| 276 | TRACE_BOOT_CALL, | ||
| 277 | |||
| 278 | F_STRUCT( | ||
| 279 | __field_struct( struct boot_trace_call, boot_call ) | ||
| 280 | __field_desc( pid_t, boot_call, caller ) | ||
| 281 | __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) | ||
| 282 | ), | ||
| 283 | |||
| 284 | F_printk("%d %s", __entry->caller, __entry->func) | ||
| 285 | ); | ||
| 286 | |||
| 287 | FTRACE_ENTRY(boot_ret, trace_boot_ret, | ||
| 288 | |||
| 289 | TRACE_BOOT_RET, | ||
| 290 | |||
| 291 | F_STRUCT( | ||
| 292 | __field_struct( struct boot_trace_ret, boot_ret ) | ||
| 293 | __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) | ||
| 294 | __field_desc( int, boot_ret, result ) | ||
| 295 | __field_desc( unsigned long, boot_ret, duration ) | ||
| 296 | ), | ||
| 297 | |||
| 298 | F_printk("%s %d %lx", | ||
| 299 | __entry->func, __entry->result, __entry->duration) | ||
| 300 | ); | ||
| 301 | 257 | ||
| 302 | #define TRACE_FUNC_SIZE 30 | 258 | #define TRACE_FUNC_SIZE 30 |
| 303 | #define TRACE_FILE_SIZE 20 | 259 | #define TRACE_FILE_SIZE 20 |
| @@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, | |||
| 318 | __entry->func, __entry->file, __entry->correct) | 274 | __entry->func, __entry->file, __entry->correct) |
| 319 | ); | 275 | ); |
| 320 | 276 | ||
| 321 | FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, | ||
| 322 | |||
| 323 | TRACE_KMEM_ALLOC, | ||
| 324 | |||
| 325 | F_STRUCT( | ||
| 326 | __field( enum kmemtrace_type_id, type_id ) | ||
| 327 | __field( unsigned long, call_site ) | ||
| 328 | __field( const void *, ptr ) | ||
| 329 | __field( size_t, bytes_req ) | ||
| 330 | __field( size_t, bytes_alloc ) | ||
| 331 | __field( gfp_t, gfp_flags ) | ||
| 332 | __field( int, node ) | ||
| 333 | ), | ||
| 334 | |||
| 335 | F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" | ||
| 336 | " flags:%x node:%d", | ||
| 337 | __entry->type_id, __entry->call_site, __entry->ptr, | ||
| 338 | __entry->bytes_req, __entry->bytes_alloc, | ||
| 339 | __entry->gfp_flags, __entry->node) | ||
| 340 | ); | ||
| 341 | |||
| 342 | FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, | ||
| 343 | |||
| 344 | TRACE_KMEM_FREE, | ||
| 345 | |||
| 346 | F_STRUCT( | ||
| 347 | __field( enum kmemtrace_type_id, type_id ) | ||
| 348 | __field( unsigned long, call_site ) | ||
| 349 | __field( const void *, ptr ) | ||
| 350 | ), | ||
| 351 | |||
| 352 | F_printk("type:%u call_site:%lx ptr:%p", | ||
| 353 | __entry->type_id, __entry->call_site, __entry->ptr) | ||
| 354 | ); | ||
| 355 | |||
| 356 | FTRACE_ENTRY(ksym_trace, ksym_trace_entry, | ||
| 357 | |||
| 358 | TRACE_KSYM, | ||
| 359 | |||
| 360 | F_STRUCT( | ||
| 361 | __field( unsigned long, ip ) | ||
| 362 | __field( unsigned char, type ) | ||
| 363 | __array( char , cmd, TASK_COMM_LEN ) | ||
| 364 | __field( unsigned long, addr ) | ||
| 365 | ), | ||
| 366 | |||
| 367 | F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", | ||
| 368 | (void *)__entry->ip, (unsigned int)__entry->type, | ||
| 369 | (void *)__entry->addr, __entry->cmd) | ||
| 370 | ); | ||
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a2b73f7c068..000e6e85b445 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -9,8 +9,6 @@ | |||
| 9 | #include <linux/kprobes.h> | 9 | #include <linux/kprobes.h> |
| 10 | #include "trace.h" | 10 | #include "trace.h" |
| 11 | 11 | ||
| 12 | EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); | ||
| 13 | |||
| 14 | static char *perf_trace_buf[4]; | 12 | static char *perf_trace_buf[4]; |
| 15 | 13 | ||
| 16 | /* | 14 | /* |
| @@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, | |||
| 56 | } | 54 | } |
| 57 | } | 55 | } |
| 58 | 56 | ||
| 59 | if (tp_event->class->reg) | 57 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); |
| 60 | ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); | ||
| 61 | else | ||
| 62 | ret = tracepoint_probe_register(tp_event->name, | ||
| 63 | tp_event->class->perf_probe, | ||
| 64 | tp_event); | ||
| 65 | |||
| 66 | if (ret) | 58 | if (ret) |
| 67 | goto fail; | 59 | goto fail; |
| 68 | 60 | ||
| @@ -96,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event) | |||
| 96 | mutex_lock(&event_mutex); | 88 | mutex_lock(&event_mutex); |
| 97 | list_for_each_entry(tp_event, &ftrace_events, list) { | 89 | list_for_each_entry(tp_event, &ftrace_events, list) { |
| 98 | if (tp_event->event.type == event_id && | 90 | if (tp_event->event.type == event_id && |
| 99 | tp_event->class && | 91 | tp_event->class && tp_event->class->reg && |
| 100 | (tp_event->class->perf_probe || | ||
| 101 | tp_event->class->reg) && | ||
| 102 | try_module_get(tp_event->mod)) { | 92 | try_module_get(tp_event->mod)) { |
| 103 | ret = perf_trace_event_init(tp_event, p_event); | 93 | ret = perf_trace_event_init(tp_event, p_event); |
| 104 | break; | 94 | break; |
| @@ -138,18 +128,13 @@ void perf_trace_destroy(struct perf_event *p_event) | |||
| 138 | if (--tp_event->perf_refcount > 0) | 128 | if (--tp_event->perf_refcount > 0) |
| 139 | goto out; | 129 | goto out; |
| 140 | 130 | ||
| 141 | if (tp_event->class->reg) | 131 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); |
| 142 | tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); | ||
| 143 | else | ||
| 144 | tracepoint_probe_unregister(tp_event->name, | ||
| 145 | tp_event->class->perf_probe, | ||
| 146 | tp_event); | ||
| 147 | 132 | ||
| 148 | /* | 133 | /* |
| 149 | * Ensure our callback won't be called anymore. See | 134 | * Ensure our callback won't be called anymore. The buffers |
| 150 | * tracepoint_probe_unregister() and __DO_TRACE(). | 135 | * will be freed after that. |
| 151 | */ | 136 | */ |
| 152 | synchronize_sched(); | 137 | tracepoint_synchronize_unregister(); |
| 153 | 138 | ||
| 154 | free_percpu(tp_event->perf_events); | 139 | free_percpu(tp_event->perf_events); |
| 155 | tp_event->perf_events = NULL; | 140 | tp_event->perf_events = NULL; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b0801..09b4fa6e4d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | DEFINE_MUTEX(event_mutex); | 28 | DEFINE_MUTEX(event_mutex); |
| 29 | 29 | ||
| 30 | LIST_HEAD(ftrace_events); | 30 | LIST_HEAD(ftrace_events); |
| 31 | LIST_HEAD(ftrace_common_fields); | ||
| 31 | 32 | ||
| 32 | struct list_head * | 33 | struct list_head * |
| 33 | trace_get_fields(struct ftrace_event_call *event_call) | 34 | trace_get_fields(struct ftrace_event_call *event_call) |
| @@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) | |||
| 37 | return event_call->class->get_fields(event_call); | 38 | return event_call->class->get_fields(event_call); |
| 38 | } | 39 | } |
| 39 | 40 | ||
| 40 | int trace_define_field(struct ftrace_event_call *call, const char *type, | 41 | static int __trace_define_field(struct list_head *head, const char *type, |
| 41 | const char *name, int offset, int size, int is_signed, | 42 | const char *name, int offset, int size, |
| 42 | int filter_type) | 43 | int is_signed, int filter_type) |
| 43 | { | 44 | { |
| 44 | struct ftrace_event_field *field; | 45 | struct ftrace_event_field *field; |
| 45 | struct list_head *head; | ||
| 46 | |||
| 47 | if (WARN_ON(!call->class)) | ||
| 48 | return 0; | ||
| 49 | 46 | ||
| 50 | field = kzalloc(sizeof(*field), GFP_KERNEL); | 47 | field = kzalloc(sizeof(*field), GFP_KERNEL); |
| 51 | if (!field) | 48 | if (!field) |
| @@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, | |||
| 68 | field->size = size; | 65 | field->size = size; |
| 69 | field->is_signed = is_signed; | 66 | field->is_signed = is_signed; |
| 70 | 67 | ||
| 71 | head = trace_get_fields(call); | ||
| 72 | list_add(&field->link, head); | 68 | list_add(&field->link, head); |
| 73 | 69 | ||
| 74 | return 0; | 70 | return 0; |
| @@ -80,17 +76,32 @@ err: | |||
| 80 | 76 | ||
| 81 | return -ENOMEM; | 77 | return -ENOMEM; |
| 82 | } | 78 | } |
| 79 | |||
| 80 | int trace_define_field(struct ftrace_event_call *call, const char *type, | ||
| 81 | const char *name, int offset, int size, int is_signed, | ||
| 82 | int filter_type) | ||
| 83 | { | ||
| 84 | struct list_head *head; | ||
| 85 | |||
| 86 | if (WARN_ON(!call->class)) | ||
| 87 | return 0; | ||
| 88 | |||
| 89 | head = trace_get_fields(call); | ||
| 90 | return __trace_define_field(head, type, name, offset, size, | ||
| 91 | is_signed, filter_type); | ||
| 92 | } | ||
| 83 | EXPORT_SYMBOL_GPL(trace_define_field); | 93 | EXPORT_SYMBOL_GPL(trace_define_field); |
| 84 | 94 | ||
| 85 | #define __common_field(type, item) \ | 95 | #define __common_field(type, item) \ |
| 86 | ret = trace_define_field(call, #type, "common_" #item, \ | 96 | ret = __trace_define_field(&ftrace_common_fields, #type, \ |
| 87 | offsetof(typeof(ent), item), \ | 97 | "common_" #item, \ |
| 88 | sizeof(ent.item), \ | 98 | offsetof(typeof(ent), item), \ |
| 89 | is_signed_type(type), FILTER_OTHER); \ | 99 | sizeof(ent.item), \ |
| 100 | is_signed_type(type), FILTER_OTHER); \ | ||
| 90 | if (ret) \ | 101 | if (ret) \ |
| 91 | return ret; | 102 | return ret; |
| 92 | 103 | ||
| 93 | static int trace_define_common_fields(struct ftrace_event_call *call) | 104 | static int trace_define_common_fields(void) |
| 94 | { | 105 | { |
| 95 | int ret; | 106 | int ret; |
| 96 | struct trace_entry ent; | 107 | struct trace_entry ent; |
| @@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call) | |||
| 130 | } | 141 | } |
| 131 | EXPORT_SYMBOL_GPL(trace_event_raw_init); | 142 | EXPORT_SYMBOL_GPL(trace_event_raw_init); |
| 132 | 143 | ||
| 144 | int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) | ||
| 145 | { | ||
| 146 | switch (type) { | ||
| 147 | case TRACE_REG_REGISTER: | ||
| 148 | return tracepoint_probe_register(call->name, | ||
| 149 | call->class->probe, | ||
| 150 | call); | ||
| 151 | case TRACE_REG_UNREGISTER: | ||
| 152 | tracepoint_probe_unregister(call->name, | ||
| 153 | call->class->probe, | ||
| 154 | call); | ||
| 155 | return 0; | ||
| 156 | |||
| 157 | #ifdef CONFIG_PERF_EVENTS | ||
| 158 | case TRACE_REG_PERF_REGISTER: | ||
| 159 | return tracepoint_probe_register(call->name, | ||
| 160 | call->class->perf_probe, | ||
| 161 | call); | ||
| 162 | case TRACE_REG_PERF_UNREGISTER: | ||
| 163 | tracepoint_probe_unregister(call->name, | ||
| 164 | call->class->perf_probe, | ||
| 165 | call); | ||
| 166 | return 0; | ||
| 167 | #endif | ||
| 168 | } | ||
| 169 | return 0; | ||
| 170 | } | ||
| 171 | EXPORT_SYMBOL_GPL(ftrace_event_reg); | ||
| 172 | |||
| 173 | void trace_event_enable_cmd_record(bool enable) | ||
| 174 | { | ||
| 175 | struct ftrace_event_call *call; | ||
| 176 | |||
| 177 | mutex_lock(&event_mutex); | ||
| 178 | list_for_each_entry(call, &ftrace_events, list) { | ||
| 179 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) | ||
| 180 | continue; | ||
| 181 | |||
| 182 | if (enable) { | ||
| 183 | tracing_start_cmdline_record(); | ||
| 184 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; | ||
| 185 | } else { | ||
| 186 | tracing_stop_cmdline_record(); | ||
| 187 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; | ||
| 188 | } | ||
| 189 | } | ||
| 190 | mutex_unlock(&event_mutex); | ||
| 191 | } | ||
| 192 | |||
| 133 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, | 193 | static int ftrace_event_enable_disable(struct ftrace_event_call *call, |
| 134 | int enable) | 194 | int enable) |
| 135 | { | 195 | { |
| @@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
| 139 | case 0: | 199 | case 0: |
| 140 | if (call->flags & TRACE_EVENT_FL_ENABLED) { | 200 | if (call->flags & TRACE_EVENT_FL_ENABLED) { |
| 141 | call->flags &= ~TRACE_EVENT_FL_ENABLED; | 201 | call->flags &= ~TRACE_EVENT_FL_ENABLED; |
| 142 | tracing_stop_cmdline_record(); | 202 | if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { |
| 143 | if (call->class->reg) | 203 | tracing_stop_cmdline_record(); |
| 144 | call->class->reg(call, TRACE_REG_UNREGISTER); | 204 | call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; |
| 145 | else | 205 | } |
| 146 | tracepoint_probe_unregister(call->name, | 206 | call->class->reg(call, TRACE_REG_UNREGISTER); |
| 147 | call->class->probe, | ||
| 148 | call); | ||
| 149 | } | 207 | } |
| 150 | break; | 208 | break; |
| 151 | case 1: | 209 | case 1: |
| 152 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { | 210 | if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { |
| 153 | tracing_start_cmdline_record(); | 211 | if (trace_flags & TRACE_ITER_RECORD_CMD) { |
| 154 | if (call->class->reg) | 212 | tracing_start_cmdline_record(); |
| 155 | ret = call->class->reg(call, TRACE_REG_REGISTER); | 213 | call->flags |= TRACE_EVENT_FL_RECORDED_CMD; |
| 156 | else | 214 | } |
| 157 | ret = tracepoint_probe_register(call->name, | 215 | ret = call->class->reg(call, TRACE_REG_REGISTER); |
| 158 | call->class->probe, | ||
| 159 | call); | ||
| 160 | if (ret) { | 216 | if (ret) { |
| 161 | tracing_stop_cmdline_record(); | 217 | tracing_stop_cmdline_record(); |
| 162 | pr_info("event trace: Could not enable event " | 218 | pr_info("event trace: Could not enable event " |
| @@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, | |||
| 194 | mutex_lock(&event_mutex); | 250 | mutex_lock(&event_mutex); |
| 195 | list_for_each_entry(call, &ftrace_events, list) { | 251 | list_for_each_entry(call, &ftrace_events, list) { |
| 196 | 252 | ||
| 197 | if (!call->name || !call->class || | 253 | if (!call->name || !call->class || !call->class->reg) |
| 198 | (!call->class->probe && !call->class->reg)) | ||
| 199 | continue; | 254 | continue; |
| 200 | 255 | ||
| 201 | if (match && | 256 | if (match && |
| @@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 321 | * The ftrace subsystem is for showing formats only. | 376 | * The ftrace subsystem is for showing formats only. |
| 322 | * They can not be enabled or disabled via the event files. | 377 | * They can not be enabled or disabled via the event files. |
| 323 | */ | 378 | */ |
| 324 | if (call->class && (call->class->probe || call->class->reg)) | 379 | if (call->class && call->class->reg) |
| 325 | return call; | 380 | return call; |
| 326 | } | 381 | } |
| 327 | 382 | ||
| @@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 474 | 529 | ||
| 475 | mutex_lock(&event_mutex); | 530 | mutex_lock(&event_mutex); |
| 476 | list_for_each_entry(call, &ftrace_events, list) { | 531 | list_for_each_entry(call, &ftrace_events, list) { |
| 477 | if (!call->name || !call->class || | 532 | if (!call->name || !call->class || !call->class->reg) |
| 478 | (!call->class->probe && !call->class->reg)) | ||
| 479 | continue; | 533 | continue; |
| 480 | 534 | ||
| 481 | if (system && strcmp(call->class->system, system) != 0) | 535 | if (system && strcmp(call->class->system, system) != 0) |
| @@ -544,32 +598,10 @@ out: | |||
| 544 | return ret; | 598 | return ret; |
| 545 | } | 599 | } |
| 546 | 600 | ||
| 547 | static ssize_t | 601 | static void print_event_fields(struct trace_seq *s, struct list_head *head) |
| 548 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | ||
| 549 | loff_t *ppos) | ||
| 550 | { | 602 | { |
| 551 | struct ftrace_event_call *call = filp->private_data; | ||
| 552 | struct ftrace_event_field *field; | 603 | struct ftrace_event_field *field; |
| 553 | struct list_head *head; | ||
| 554 | struct trace_seq *s; | ||
| 555 | int common_field_count = 5; | ||
| 556 | char *buf; | ||
| 557 | int r = 0; | ||
| 558 | |||
| 559 | if (*ppos) | ||
| 560 | return 0; | ||
| 561 | |||
| 562 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 563 | if (!s) | ||
| 564 | return -ENOMEM; | ||
| 565 | |||
| 566 | trace_seq_init(s); | ||
| 567 | |||
| 568 | trace_seq_printf(s, "name: %s\n", call->name); | ||
| 569 | trace_seq_printf(s, "ID: %d\n", call->event.type); | ||
| 570 | trace_seq_printf(s, "format:\n"); | ||
| 571 | 604 | ||
| 572 | head = trace_get_fields(call); | ||
| 573 | list_for_each_entry_reverse(field, head, link) { | 605 | list_for_each_entry_reverse(field, head, link) { |
| 574 | /* | 606 | /* |
| 575 | * Smartly shows the array type(except dynamic array). | 607 | * Smartly shows the array type(except dynamic array). |
| @@ -584,29 +616,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 584 | array_descriptor = NULL; | 616 | array_descriptor = NULL; |
| 585 | 617 | ||
| 586 | if (!array_descriptor) { | 618 | if (!array_descriptor) { |
| 587 | r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" | 619 | trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" |
| 588 | "\tsize:%u;\tsigned:%d;\n", | 620 | "\tsize:%u;\tsigned:%d;\n", |
| 589 | field->type, field->name, field->offset, | 621 | field->type, field->name, field->offset, |
| 590 | field->size, !!field->is_signed); | 622 | field->size, !!field->is_signed); |
| 591 | } else { | 623 | } else { |
| 592 | r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" | 624 | trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" |
| 593 | "\tsize:%u;\tsigned:%d;\n", | 625 | "\tsize:%u;\tsigned:%d;\n", |
| 594 | (int)(array_descriptor - field->type), | 626 | (int)(array_descriptor - field->type), |
| 595 | field->type, field->name, | 627 | field->type, field->name, |
| 596 | array_descriptor, field->offset, | 628 | array_descriptor, field->offset, |
| 597 | field->size, !!field->is_signed); | 629 | field->size, !!field->is_signed); |
| 598 | } | 630 | } |
| 631 | } | ||
| 632 | } | ||
| 599 | 633 | ||
| 600 | if (--common_field_count == 0) | 634 | static ssize_t |
| 601 | r = trace_seq_printf(s, "\n"); | 635 | event_format_read(struct file *filp, char __user *ubuf, size_t cnt, |
| 636 | loff_t *ppos) | ||
| 637 | { | ||
| 638 | struct ftrace_event_call *call = filp->private_data; | ||
| 639 | struct list_head *head; | ||
| 640 | struct trace_seq *s; | ||
| 641 | char *buf; | ||
| 642 | int r; | ||
| 602 | 643 | ||
| 603 | if (!r) | 644 | if (*ppos) |
| 604 | break; | 645 | return 0; |
| 605 | } | 646 | |
| 647 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 648 | if (!s) | ||
| 649 | return -ENOMEM; | ||
| 650 | |||
| 651 | trace_seq_init(s); | ||
| 652 | |||
| 653 | trace_seq_printf(s, "name: %s\n", call->name); | ||
| 654 | trace_seq_printf(s, "ID: %d\n", call->event.type); | ||
| 655 | trace_seq_printf(s, "format:\n"); | ||
| 656 | |||
| 657 | /* print common fields */ | ||
| 658 | print_event_fields(s, &ftrace_common_fields); | ||
| 606 | 659 | ||
| 607 | if (r) | 660 | trace_seq_putc(s, '\n'); |
| 608 | r = trace_seq_printf(s, "\nprint fmt: %s\n", | 661 | |
| 609 | call->print_fmt); | 662 | /* print event specific fields */ |
| 663 | head = trace_get_fields(call); | ||
| 664 | print_event_fields(s, head); | ||
| 665 | |||
| 666 | r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt); | ||
| 610 | 667 | ||
| 611 | if (!r) { | 668 | if (!r) { |
| 612 | /* | 669 | /* |
| @@ -963,35 +1020,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
| 963 | return -1; | 1020 | return -1; |
| 964 | } | 1021 | } |
| 965 | 1022 | ||
| 966 | if (call->class->probe || call->class->reg) | 1023 | if (call->class->reg) |
| 967 | trace_create_file("enable", 0644, call->dir, call, | 1024 | trace_create_file("enable", 0644, call->dir, call, |
| 968 | enable); | 1025 | enable); |
| 969 | 1026 | ||
| 970 | #ifdef CONFIG_PERF_EVENTS | 1027 | #ifdef CONFIG_PERF_EVENTS |
| 971 | if (call->event.type && (call->class->perf_probe || call->class->reg)) | 1028 | if (call->event.type && call->class->reg) |
| 972 | trace_create_file("id", 0444, call->dir, call, | 1029 | trace_create_file("id", 0444, call->dir, call, |
| 973 | id); | 1030 | id); |
| 974 | #endif | 1031 | #endif |
| 975 | 1032 | ||
| 976 | if (call->class->define_fields) { | 1033 | /* |
| 977 | /* | 1034 | * Other events may have the same class. Only update |
| 978 | * Other events may have the same class. Only update | 1035 | * the fields if they are not already defined. |
| 979 | * the fields if they are not already defined. | 1036 | */ |
| 980 | */ | 1037 | head = trace_get_fields(call); |
| 981 | head = trace_get_fields(call); | 1038 | if (list_empty(head)) { |
| 982 | if (list_empty(head)) { | 1039 | ret = call->class->define_fields(call); |
| 983 | ret = trace_define_common_fields(call); | 1040 | if (ret < 0) { |
| 984 | if (!ret) | 1041 | pr_warning("Could not initialize trace point" |
| 985 | ret = call->class->define_fields(call); | 1042 | " events/%s\n", call->name); |
| 986 | if (ret < 0) { | 1043 | return ret; |
| 987 | pr_warning("Could not initialize trace point" | ||
| 988 | " events/%s\n", call->name); | ||
| 989 | return ret; | ||
| 990 | } | ||
| 991 | } | 1044 | } |
| 992 | trace_create_file("filter", 0644, call->dir, call, | ||
| 993 | filter); | ||
| 994 | } | 1045 | } |
| 1046 | trace_create_file("filter", 0644, call->dir, call, | ||
| 1047 | filter); | ||
| 995 | 1048 | ||
| 996 | trace_create_file("format", 0444, call->dir, call, | 1049 | trace_create_file("format", 0444, call->dir, call, |
| 997 | format); | 1050 | format); |
| @@ -999,11 +1052,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
| 999 | return 0; | 1052 | return 0; |
| 1000 | } | 1053 | } |
| 1001 | 1054 | ||
| 1002 | static int __trace_add_event_call(struct ftrace_event_call *call) | 1055 | static int |
| 1056 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | ||
| 1057 | const struct file_operations *id, | ||
| 1058 | const struct file_operations *enable, | ||
| 1059 | const struct file_operations *filter, | ||
| 1060 | const struct file_operations *format) | ||
| 1003 | { | 1061 | { |
| 1004 | struct dentry *d_events; | 1062 | struct dentry *d_events; |
| 1005 | int ret; | 1063 | int ret; |
| 1006 | 1064 | ||
| 1065 | /* The linker may leave blanks */ | ||
| 1007 | if (!call->name) | 1066 | if (!call->name) |
| 1008 | return -EINVAL; | 1067 | return -EINVAL; |
| 1009 | 1068 | ||
| @@ -1011,8 +1070,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) | |||
| 1011 | ret = call->class->raw_init(call); | 1070 | ret = call->class->raw_init(call); |
| 1012 | if (ret < 0) { | 1071 | if (ret < 0) { |
| 1013 | if (ret != -ENOSYS) | 1072 | if (ret != -ENOSYS) |
| 1014 | pr_warning("Could not initialize trace " | 1073 | pr_warning("Could not initialize trace events/%s\n", |
| 1015 | "events/%s\n", call->name); | 1074 | call->name); |
| 1016 | return ret; | 1075 | return ret; |
| 1017 | } | 1076 | } |
| 1018 | } | 1077 | } |
| @@ -1021,11 +1080,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) | |||
| 1021 | if (!d_events) | 1080 | if (!d_events) |
| 1022 | return -ENOENT; | 1081 | return -ENOENT; |
| 1023 | 1082 | ||
| 1024 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, | 1083 | ret = event_create_dir(call, d_events, id, enable, filter, format); |
| 1025 | &ftrace_enable_fops, &ftrace_event_filter_fops, | ||
| 1026 | &ftrace_event_format_fops); | ||
| 1027 | if (!ret) | 1084 | if (!ret) |
| 1028 | list_add(&call->list, &ftrace_events); | 1085 | list_add(&call->list, &ftrace_events); |
| 1086 | call->mod = mod; | ||
| 1029 | 1087 | ||
| 1030 | return ret; | 1088 | return ret; |
| 1031 | } | 1089 | } |
| @@ -1035,7 +1093,10 @@ int trace_add_event_call(struct ftrace_event_call *call) | |||
| 1035 | { | 1093 | { |
| 1036 | int ret; | 1094 | int ret; |
| 1037 | mutex_lock(&event_mutex); | 1095 | mutex_lock(&event_mutex); |
| 1038 | ret = __trace_add_event_call(call); | 1096 | ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, |
| 1097 | &ftrace_enable_fops, | ||
| 1098 | &ftrace_event_filter_fops, | ||
| 1099 | &ftrace_event_format_fops); | ||
| 1039 | mutex_unlock(&event_mutex); | 1100 | mutex_unlock(&event_mutex); |
| 1040 | return ret; | 1101 | return ret; |
| 1041 | } | 1102 | } |
| @@ -1152,8 +1213,6 @@ static void trace_module_add_events(struct module *mod) | |||
| 1152 | { | 1213 | { |
| 1153 | struct ftrace_module_file_ops *file_ops = NULL; | 1214 | struct ftrace_module_file_ops *file_ops = NULL; |
| 1154 | struct ftrace_event_call *call, *start, *end; | 1215 | struct ftrace_event_call *call, *start, *end; |
| 1155 | struct dentry *d_events; | ||
| 1156 | int ret; | ||
| 1157 | 1216 | ||
| 1158 | start = mod->trace_events; | 1217 | start = mod->trace_events; |
| 1159 | end = mod->trace_events + mod->num_trace_events; | 1218 | end = mod->trace_events + mod->num_trace_events; |
| @@ -1161,38 +1220,14 @@ static void trace_module_add_events(struct module *mod) | |||
| 1161 | if (start == end) | 1220 | if (start == end) |
| 1162 | return; | 1221 | return; |
| 1163 | 1222 | ||
| 1164 | d_events = event_trace_events_dir(); | 1223 | file_ops = trace_create_file_ops(mod); |
| 1165 | if (!d_events) | 1224 | if (!file_ops) |
| 1166 | return; | 1225 | return; |
| 1167 | 1226 | ||
| 1168 | for_each_event(call, start, end) { | 1227 | for_each_event(call, start, end) { |
| 1169 | /* The linker may leave blanks */ | 1228 | __trace_add_event_call(call, mod, |
| 1170 | if (!call->name) | ||
| 1171 | continue; | ||
| 1172 | if (call->class->raw_init) { | ||
| 1173 | ret = call->class->raw_init(call); | ||
| 1174 | if (ret < 0) { | ||
| 1175 | if (ret != -ENOSYS) | ||
| 1176 | pr_warning("Could not initialize trace " | ||
| 1177 | "point events/%s\n", call->name); | ||
| 1178 | continue; | ||
| 1179 | } | ||
| 1180 | } | ||
| 1181 | /* | ||
| 1182 | * This module has events, create file ops for this module | ||
| 1183 | * if not already done. | ||
| 1184 | */ | ||
| 1185 | if (!file_ops) { | ||
| 1186 | file_ops = trace_create_file_ops(mod); | ||
| 1187 | if (!file_ops) | ||
| 1188 | return; | ||
| 1189 | } | ||
| 1190 | call->mod = mod; | ||
| 1191 | ret = event_create_dir(call, d_events, | ||
| 1192 | &file_ops->id, &file_ops->enable, | 1229 | &file_ops->id, &file_ops->enable, |
| 1193 | &file_ops->filter, &file_ops->format); | 1230 | &file_ops->filter, &file_ops->format); |
| 1194 | if (!ret) | ||
| 1195 | list_add(&call->list, &ftrace_events); | ||
| 1196 | } | 1231 | } |
| 1197 | } | 1232 | } |
| 1198 | 1233 | ||
| @@ -1319,25 +1354,14 @@ static __init int event_trace_init(void) | |||
| 1319 | trace_create_file("enable", 0644, d_events, | 1354 | trace_create_file("enable", 0644, d_events, |
| 1320 | NULL, &ftrace_system_enable_fops); | 1355 | NULL, &ftrace_system_enable_fops); |
| 1321 | 1356 | ||
| 1357 | if (trace_define_common_fields()) | ||
| 1358 | pr_warning("tracing: Failed to allocate common fields"); | ||
| 1359 | |||
| 1322 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1360 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { |
| 1323 | /* The linker may leave blanks */ | 1361 | __trace_add_event_call(call, NULL, &ftrace_event_id_fops, |
| 1324 | if (!call->name) | ||
| 1325 | continue; | ||
| 1326 | if (call->class->raw_init) { | ||
| 1327 | ret = call->class->raw_init(call); | ||
| 1328 | if (ret < 0) { | ||
| 1329 | if (ret != -ENOSYS) | ||
| 1330 | pr_warning("Could not initialize trace " | ||
| 1331 | "point events/%s\n", call->name); | ||
| 1332 | continue; | ||
| 1333 | } | ||
| 1334 | } | ||
| 1335 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, | ||
| 1336 | &ftrace_enable_fops, | 1362 | &ftrace_enable_fops, |
| 1337 | &ftrace_event_filter_fops, | 1363 | &ftrace_event_filter_fops, |
| 1338 | &ftrace_event_format_fops); | 1364 | &ftrace_event_format_fops); |
| 1339 | if (!ret) | ||
| 1340 | list_add(&call->list, &ftrace_events); | ||
| 1341 | } | 1365 | } |
| 1342 | 1366 | ||
| 1343 | while (true) { | 1367 | while (true) { |
| @@ -1524,12 +1548,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
| 1524 | struct ftrace_entry *entry; | 1548 | struct ftrace_entry *entry; |
| 1525 | unsigned long flags; | 1549 | unsigned long flags; |
| 1526 | long disabled; | 1550 | long disabled; |
| 1527 | int resched; | ||
| 1528 | int cpu; | 1551 | int cpu; |
| 1529 | int pc; | 1552 | int pc; |
| 1530 | 1553 | ||
| 1531 | pc = preempt_count(); | 1554 | pc = preempt_count(); |
| 1532 | resched = ftrace_preempt_disable(); | 1555 | preempt_disable_notrace(); |
| 1533 | cpu = raw_smp_processor_id(); | 1556 | cpu = raw_smp_processor_id(); |
| 1534 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); | 1557 | disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); |
| 1535 | 1558 | ||
| @@ -1551,7 +1574,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
| 1551 | 1574 | ||
| 1552 | out: | 1575 | out: |
| 1553 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); | 1576 | atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); |
| 1554 | ftrace_preempt_enable(resched); | 1577 | preempt_enable_notrace(); |
| 1555 | } | 1578 | } |
| 1556 | 1579 | ||
| 1557 | static struct ftrace_ops trace_ops __initdata = | 1580 | static struct ftrace_ops trace_ops __initdata = |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb32999..36d40104b17f 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
| 497 | } | 497 | } |
| 498 | 498 | ||
| 499 | static struct ftrace_event_field * | 499 | static struct ftrace_event_field * |
| 500 | find_event_field(struct ftrace_event_call *call, char *name) | 500 | __find_event_field(struct list_head *head, char *name) |
| 501 | { | 501 | { |
| 502 | struct ftrace_event_field *field; | 502 | struct ftrace_event_field *field; |
| 503 | struct list_head *head; | ||
| 504 | 503 | ||
| 505 | head = trace_get_fields(call); | ||
| 506 | list_for_each_entry(field, head, link) { | 504 | list_for_each_entry(field, head, link) { |
| 507 | if (!strcmp(field->name, name)) | 505 | if (!strcmp(field->name, name)) |
| 508 | return field; | 506 | return field; |
| @@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) | |||
| 511 | return NULL; | 509 | return NULL; |
| 512 | } | 510 | } |
| 513 | 511 | ||
| 512 | static struct ftrace_event_field * | ||
| 513 | find_event_field(struct ftrace_event_call *call, char *name) | ||
| 514 | { | ||
| 515 | struct ftrace_event_field *field; | ||
| 516 | struct list_head *head; | ||
| 517 | |||
| 518 | field = __find_event_field(&ftrace_common_fields, name); | ||
| 519 | if (field) | ||
| 520 | return field; | ||
| 521 | |||
| 522 | head = trace_get_fields(call); | ||
| 523 | return __find_event_field(head, name); | ||
| 524 | } | ||
| 525 | |||
| 514 | static void filter_free_pred(struct filter_pred *pred) | 526 | static void filter_free_pred(struct filter_pred *pred) |
| 515 | { | 527 | { |
| 516 | if (!pred) | 528 | if (!pred) |
| @@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) | |||
| 627 | int err; | 639 | int err; |
| 628 | 640 | ||
| 629 | list_for_each_entry(call, &ftrace_events, list) { | 641 | list_for_each_entry(call, &ftrace_events, list) { |
| 630 | if (!call->class || !call->class->define_fields) | ||
| 631 | continue; | ||
| 632 | |||
| 633 | if (strcmp(call->class->system, system->name) != 0) | 642 | if (strcmp(call->class->system, system->name) != 0) |
| 634 | continue; | 643 | continue; |
| 635 | 644 | ||
| @@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) | |||
| 646 | struct ftrace_event_call *call; | 655 | struct ftrace_event_call *call; |
| 647 | 656 | ||
| 648 | list_for_each_entry(call, &ftrace_events, list) { | 657 | list_for_each_entry(call, &ftrace_events, list) { |
| 649 | if (!call->class || !call->class->define_fields) | ||
| 650 | continue; | ||
| 651 | |||
| 652 | if (strcmp(call->class->system, system->name) != 0) | 658 | if (strcmp(call->class->system, system->name) != 0) |
| 653 | continue; | 659 | continue; |
| 654 | 660 | ||
| @@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, | |||
| 1251 | list_for_each_entry(call, &ftrace_events, list) { | 1257 | list_for_each_entry(call, &ftrace_events, list) { |
| 1252 | struct event_filter *filter = call->filter; | 1258 | struct event_filter *filter = call->filter; |
| 1253 | 1259 | ||
| 1254 | if (!call->class || !call->class->define_fields) | ||
| 1255 | continue; | ||
| 1256 | |||
| 1257 | if (strcmp(call->class->system, system->name) != 0) | 1260 | if (strcmp(call->class->system, system->name) != 0) |
| 1258 | continue; | 1261 | continue; |
| 1259 | 1262 | ||
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a65969..4ba44deaac25 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
| @@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
| 125 | 125 | ||
| 126 | #include "trace_entries.h" | 126 | #include "trace_entries.h" |
| 127 | 127 | ||
| 128 | static int ftrace_raw_init_event(struct ftrace_event_call *call) | ||
| 129 | { | ||
| 130 | INIT_LIST_HEAD(&call->class->fields); | ||
| 131 | return 0; | ||
| 132 | } | ||
| 133 | |||
| 134 | #undef __entry | 128 | #undef __entry |
| 135 | #define __entry REC | 129 | #define __entry REC |
| 136 | 130 | ||
| @@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) | |||
| 158 | struct ftrace_event_class event_class_ftrace_##call = { \ | 152 | struct ftrace_event_class event_class_ftrace_##call = { \ |
| 159 | .system = __stringify(TRACE_SYSTEM), \ | 153 | .system = __stringify(TRACE_SYSTEM), \ |
| 160 | .define_fields = ftrace_define_fields_##call, \ | 154 | .define_fields = ftrace_define_fields_##call, \ |
| 161 | .raw_init = ftrace_raw_init_event, \ | 155 | .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ |
| 162 | }; \ | 156 | }; \ |
| 163 | \ | 157 | \ |
| 164 | struct ftrace_event_call __used \ | 158 | struct ftrace_event_call __used \ |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd6..16aee4d44e8f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
| 54 | struct trace_array_cpu *data; | 54 | struct trace_array_cpu *data; |
| 55 | unsigned long flags; | 55 | unsigned long flags; |
| 56 | long disabled; | 56 | long disabled; |
| 57 | int cpu, resched; | 57 | int cpu; |
| 58 | int pc; | 58 | int pc; |
| 59 | 59 | ||
| 60 | if (unlikely(!ftrace_function_enabled)) | 60 | if (unlikely(!ftrace_function_enabled)) |
| 61 | return; | 61 | return; |
| 62 | 62 | ||
| 63 | pc = preempt_count(); | 63 | pc = preempt_count(); |
| 64 | resched = ftrace_preempt_disable(); | 64 | preempt_disable_notrace(); |
| 65 | local_save_flags(flags); | 65 | local_save_flags(flags); |
| 66 | cpu = raw_smp_processor_id(); | 66 | cpu = raw_smp_processor_id(); |
| 67 | data = tr->data[cpu]; | 67 | data = tr->data[cpu]; |
| @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
| 71 | trace_function(tr, ip, parent_ip, flags, pc); | 71 | trace_function(tr, ip, parent_ip, flags, pc); |
| 72 | 72 | ||
| 73 | atomic_dec(&data->disabled); | 73 | atomic_dec(&data->disabled); |
| 74 | ftrace_preempt_enable(resched); | 74 | preempt_enable_notrace(); |
| 75 | } | 75 | } |
| 76 | 76 | ||
| 77 | static void | 77 | static void |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a94..6bff23625781 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
| 641 | 641 | ||
| 642 | /* Print nsecs (we don't want to exceed 7 numbers) */ | 642 | /* Print nsecs (we don't want to exceed 7 numbers) */ |
| 643 | if (len < 7) { | 643 | if (len < 7) { |
| 644 | snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); | 644 | snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", |
| 645 | nsecs_rem); | ||
| 645 | ret = trace_seq_printf(s, ".%s", nsecs_str); | 646 | ret = trace_seq_printf(s, ".%s", nsecs_str); |
| 646 | if (!ret) | 647 | if (!ret) |
| 647 | return TRACE_TYPE_PARTIAL_LINE; | 648 | return TRACE_TYPE_PARTIAL_LINE; |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e0cef4..73a6b0601f2e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
| 649 | #endif | 649 | #endif |
| 650 | .open = irqsoff_trace_open, | 650 | .open = irqsoff_trace_open, |
| 651 | .close = irqsoff_trace_close, | 651 | .close = irqsoff_trace_close, |
| 652 | .use_max_tr = 1, | ||
| 652 | }; | 653 | }; |
| 653 | # define register_irqsoff(trace) register_tracer(&trace) | 654 | # define register_irqsoff(trace) register_tracer(&trace) |
| 654 | #else | 655 | #else |
| @@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
| 681 | #endif | 682 | #endif |
| 682 | .open = irqsoff_trace_open, | 683 | .open = irqsoff_trace_open, |
| 683 | .close = irqsoff_trace_close, | 684 | .close = irqsoff_trace_close, |
| 685 | .use_max_tr = 1, | ||
| 684 | }; | 686 | }; |
| 685 | # define register_preemptoff(trace) register_tracer(&trace) | 687 | # define register_preemptoff(trace) register_tracer(&trace) |
| 686 | #else | 688 | #else |
| @@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
| 715 | #endif | 717 | #endif |
| 716 | .open = irqsoff_trace_open, | 718 | .open = irqsoff_trace_open, |
| 717 | .close = irqsoff_trace_close, | 719 | .close = irqsoff_trace_close, |
| 720 | .use_max_tr = 1, | ||
| 718 | }; | 721 | }; |
| 719 | 722 | ||
| 720 | # define register_preemptirqsoff(trace) register_tracer(&trace) | 723 | # define register_preemptirqsoff(trace) register_tracer(&trace) |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299d..8b27c9849b42 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -30,6 +30,8 @@ | |||
| 30 | #include <linux/ptrace.h> | 30 | #include <linux/ptrace.h> |
| 31 | #include <linux/perf_event.h> | 31 | #include <linux/perf_event.h> |
| 32 | #include <linux/stringify.h> | 32 | #include <linux/stringify.h> |
| 33 | #include <linux/limits.h> | ||
| 34 | #include <linux/uaccess.h> | ||
| 33 | #include <asm/bitsperlong.h> | 35 | #include <asm/bitsperlong.h> |
| 34 | 36 | ||
| 35 | #include "trace.h" | 37 | #include "trace.h" |
| @@ -38,6 +40,7 @@ | |||
| 38 | #define MAX_TRACE_ARGS 128 | 40 | #define MAX_TRACE_ARGS 128 |
| 39 | #define MAX_ARGSTR_LEN 63 | 41 | #define MAX_ARGSTR_LEN 63 |
| 40 | #define MAX_EVENT_NAME_LEN 64 | 42 | #define MAX_EVENT_NAME_LEN 64 |
| 43 | #define MAX_STRING_SIZE PATH_MAX | ||
| 41 | #define KPROBE_EVENT_SYSTEM "kprobes" | 44 | #define KPROBE_EVENT_SYSTEM "kprobes" |
| 42 | 45 | ||
| 43 | /* Reserved field names */ | 46 | /* Reserved field names */ |
| @@ -58,14 +61,16 @@ const char *reserved_field_names[] = { | |||
| 58 | }; | 61 | }; |
| 59 | 62 | ||
| 60 | /* Printing function type */ | 63 | /* Printing function type */ |
| 61 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); | 64 | typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, |
| 65 | void *); | ||
| 62 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type | 66 | #define PRINT_TYPE_FUNC_NAME(type) print_type_##type |
| 63 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type | 67 | #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type |
| 64 | 68 | ||
| 65 | /* Printing in basic type function template */ | 69 | /* Printing in basic type function template */ |
| 66 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ | 70 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ |
| 67 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | 71 | static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ |
| 68 | const char *name, void *data)\ | 72 | const char *name, \ |
| 73 | void *data, void *ent)\ | ||
| 69 | { \ | 74 | { \ |
| 70 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ | 75 | return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ |
| 71 | } \ | 76 | } \ |
| @@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) | |||
| 80 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) | 85 | DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) |
| 81 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) | 86 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) |
| 82 | 87 | ||
| 88 | /* data_rloc: data relative location, compatible with u32 */ | ||
| 89 | #define make_data_rloc(len, roffs) \ | ||
| 90 | (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) | ||
| 91 | #define get_rloc_len(dl) ((u32)(dl) >> 16) | ||
| 92 | #define get_rloc_offs(dl) ((u32)(dl) & 0xffff) | ||
| 93 | |||
| 94 | static inline void *get_rloc_data(u32 *dl) | ||
| 95 | { | ||
| 96 | return (u8 *)dl + get_rloc_offs(*dl); | ||
| 97 | } | ||
| 98 | |||
| 99 | /* For data_loc conversion */ | ||
| 100 | static inline void *get_loc_data(u32 *dl, void *ent) | ||
| 101 | { | ||
| 102 | return (u8 *)ent + get_rloc_offs(*dl); | ||
| 103 | } | ||
| 104 | |||
| 105 | /* | ||
| 106 | * Convert data_rloc to data_loc: | ||
| 107 | * data_rloc stores the offset from data_rloc itself, but data_loc | ||
| 108 | * stores the offset from event entry. | ||
| 109 | */ | ||
| 110 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | ||
| 111 | |||
| 112 | /* For defining macros, define string/string_size types */ | ||
| 113 | typedef u32 string; | ||
| 114 | typedef u32 string_size; | ||
| 115 | |||
| 116 | /* Print type function for string type */ | ||
| 117 | static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | ||
| 118 | const char *name, | ||
| 119 | void *data, void *ent) | ||
| 120 | { | ||
| 121 | int len = *(u32 *)data >> 16; | ||
| 122 | |||
| 123 | if (!len) | ||
| 124 | return trace_seq_printf(s, " %s=(fault)", name); | ||
| 125 | else | ||
| 126 | return trace_seq_printf(s, " %s=\"%s\"", name, | ||
| 127 | (const char *)get_loc_data(data, ent)); | ||
| 128 | } | ||
| 129 | static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | ||
| 130 | |||
| 83 | /* Data fetch function type */ | 131 | /* Data fetch function type */ |
| 84 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); | 132 | typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); |
| 85 | 133 | ||
| @@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, | |||
| 94 | return fprm->fn(regs, fprm->data, dest); | 142 | return fprm->fn(regs, fprm->data, dest); |
| 95 | } | 143 | } |
| 96 | 144 | ||
| 97 | #define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type | 145 | #define FETCH_FUNC_NAME(method, type) fetch_##method##_##type |
| 98 | /* | 146 | /* |
| 99 | * Define macro for basic types - we don't need to define s* types, because | 147 | * Define macro for basic types - we don't need to define s* types, because |
| 100 | * we have to care only about bitwidth at recording time. | 148 | * we have to care only about bitwidth at recording time. |
| 101 | */ | 149 | */ |
| 102 | #define DEFINE_BASIC_FETCH_FUNCS(kind) \ | 150 | #define DEFINE_BASIC_FETCH_FUNCS(method) \ |
| 103 | DEFINE_FETCH_##kind(u8) \ | 151 | DEFINE_FETCH_##method(u8) \ |
| 104 | DEFINE_FETCH_##kind(u16) \ | 152 | DEFINE_FETCH_##method(u16) \ |
| 105 | DEFINE_FETCH_##kind(u32) \ | 153 | DEFINE_FETCH_##method(u32) \ |
| 106 | DEFINE_FETCH_##kind(u64) | 154 | DEFINE_FETCH_##method(u64) |
| 107 | 155 | ||
| 108 | #define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ | 156 | #define CHECK_FETCH_FUNCS(method, fn) \ |
| 109 | ((FETCH_FUNC_NAME(kind, u8) == fn) || \ | 157 | (((FETCH_FUNC_NAME(method, u8) == fn) || \ |
| 110 | (FETCH_FUNC_NAME(kind, u16) == fn) || \ | 158 | (FETCH_FUNC_NAME(method, u16) == fn) || \ |
| 111 | (FETCH_FUNC_NAME(kind, u32) == fn) || \ | 159 | (FETCH_FUNC_NAME(method, u32) == fn) || \ |
| 112 | (FETCH_FUNC_NAME(kind, u64) == fn)) | 160 | (FETCH_FUNC_NAME(method, u64) == fn) || \ |
| 161 | (FETCH_FUNC_NAME(method, string) == fn) || \ | ||
| 162 | (FETCH_FUNC_NAME(method, string_size) == fn)) \ | ||
| 163 | && (fn != NULL)) | ||
| 113 | 164 | ||
| 114 | /* Data fetch function templates */ | 165 | /* Data fetch function templates */ |
| 115 | #define DEFINE_FETCH_reg(type) \ | 166 | #define DEFINE_FETCH_reg(type) \ |
| 116 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | 167 | static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ |
| 117 | void *offset, void *dest) \ | 168 | void *offset, void *dest) \ |
| 118 | { \ | 169 | { \ |
| 119 | *(type *)dest = (type)regs_get_register(regs, \ | 170 | *(type *)dest = (type)regs_get_register(regs, \ |
| 120 | (unsigned int)((unsigned long)offset)); \ | 171 | (unsigned int)((unsigned long)offset)); \ |
| 121 | } | 172 | } |
| 122 | DEFINE_BASIC_FETCH_FUNCS(reg) | 173 | DEFINE_BASIC_FETCH_FUNCS(reg) |
| 174 | /* No string on the register */ | ||
| 175 | #define fetch_reg_string NULL | ||
| 176 | #define fetch_reg_string_size NULL | ||
| 123 | 177 | ||
| 124 | #define DEFINE_FETCH_stack(type) \ | 178 | #define DEFINE_FETCH_stack(type) \ |
| 125 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | 179 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ |
| @@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | |||
| 129 | (unsigned int)((unsigned long)offset)); \ | 183 | (unsigned int)((unsigned long)offset)); \ |
| 130 | } | 184 | } |
| 131 | DEFINE_BASIC_FETCH_FUNCS(stack) | 185 | DEFINE_BASIC_FETCH_FUNCS(stack) |
| 186 | /* No string on the stack entry */ | ||
| 187 | #define fetch_stack_string NULL | ||
| 188 | #define fetch_stack_string_size NULL | ||
| 132 | 189 | ||
| 133 | #define DEFINE_FETCH_retval(type) \ | 190 | #define DEFINE_FETCH_retval(type) \ |
| 134 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | 191 | static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ |
| @@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ | |||
| 137 | *(type *)dest = (type)regs_return_value(regs); \ | 194 | *(type *)dest = (type)regs_return_value(regs); \ |
| 138 | } | 195 | } |
| 139 | DEFINE_BASIC_FETCH_FUNCS(retval) | 196 | DEFINE_BASIC_FETCH_FUNCS(retval) |
| 197 | /* No string on the retval */ | ||
| 198 | #define fetch_retval_string NULL | ||
| 199 | #define fetch_retval_string_size NULL | ||
| 140 | 200 | ||
| 141 | #define DEFINE_FETCH_memory(type) \ | 201 | #define DEFINE_FETCH_memory(type) \ |
| 142 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | 202 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ |
| @@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | |||
| 149 | *(type *)dest = retval; \ | 209 | *(type *)dest = retval; \ |
| 150 | } | 210 | } |
| 151 | DEFINE_BASIC_FETCH_FUNCS(memory) | 211 | DEFINE_BASIC_FETCH_FUNCS(memory) |
| 212 | /* | ||
| 213 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | ||
| 214 | * length and relative data location. | ||
| 215 | */ | ||
| 216 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | ||
| 217 | void *addr, void *dest) | ||
| 218 | { | ||
| 219 | long ret; | ||
| 220 | int maxlen = get_rloc_len(*(u32 *)dest); | ||
| 221 | u8 *dst = get_rloc_data(dest); | ||
| 222 | u8 *src = addr; | ||
| 223 | mm_segment_t old_fs = get_fs(); | ||
| 224 | if (!maxlen) | ||
| 225 | return; | ||
| 226 | /* | ||
| 227 | * Try to get string again, since the string can be changed while | ||
| 228 | * probing. | ||
| 229 | */ | ||
| 230 | set_fs(KERNEL_DS); | ||
| 231 | pagefault_disable(); | ||
| 232 | do | ||
| 233 | ret = __copy_from_user_inatomic(dst++, src++, 1); | ||
| 234 | while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); | ||
| 235 | dst[-1] = '\0'; | ||
| 236 | pagefault_enable(); | ||
| 237 | set_fs(old_fs); | ||
| 238 | |||
| 239 | if (ret < 0) { /* Failed to fetch string */ | ||
| 240 | ((u8 *)get_rloc_data(dest))[0] = '\0'; | ||
| 241 | *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); | ||
| 242 | } else | ||
| 243 | *(u32 *)dest = make_data_rloc(src - (u8 *)addr, | ||
| 244 | get_rloc_offs(*(u32 *)dest)); | ||
| 245 | } | ||
| 246 | /* Return the length of string -- including null terminal byte */ | ||
| 247 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | ||
| 248 | void *addr, void *dest) | ||
| 249 | { | ||
| 250 | int ret, len = 0; | ||
| 251 | u8 c; | ||
| 252 | mm_segment_t old_fs = get_fs(); | ||
| 253 | |||
| 254 | set_fs(KERNEL_DS); | ||
| 255 | pagefault_disable(); | ||
| 256 | do { | ||
| 257 | ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); | ||
| 258 | len++; | ||
| 259 | } while (c && ret == 0 && len < MAX_STRING_SIZE); | ||
| 260 | pagefault_enable(); | ||
| 261 | set_fs(old_fs); | ||
| 262 | |||
| 263 | if (ret < 0) /* Failed to check the length */ | ||
| 264 | *(u32 *)dest = 0; | ||
| 265 | else | ||
| 266 | *(u32 *)dest = len; | ||
| 267 | } | ||
| 152 | 268 | ||
| 153 | /* Memory fetching by symbol */ | 269 | /* Memory fetching by symbol */ |
| 154 | struct symbol_cache { | 270 | struct symbol_cache { |
| @@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ | |||
| 203 | *(type *)dest = 0; \ | 319 | *(type *)dest = 0; \ |
| 204 | } | 320 | } |
| 205 | DEFINE_BASIC_FETCH_FUNCS(symbol) | 321 | DEFINE_BASIC_FETCH_FUNCS(symbol) |
| 322 | DEFINE_FETCH_symbol(string) | ||
| 323 | DEFINE_FETCH_symbol(string_size) | ||
| 206 | 324 | ||
| 207 | /* Dereference memory access function */ | 325 | /* Dereference memory access function */ |
| 208 | struct deref_fetch_param { | 326 | struct deref_fetch_param { |
| @@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ | |||
| 224 | *(type *)dest = 0; \ | 342 | *(type *)dest = 0; \ |
| 225 | } | 343 | } |
| 226 | DEFINE_BASIC_FETCH_FUNCS(deref) | 344 | DEFINE_BASIC_FETCH_FUNCS(deref) |
| 345 | DEFINE_FETCH_deref(string) | ||
| 346 | DEFINE_FETCH_deref(string_size) | ||
| 227 | 347 | ||
| 228 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | 348 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) |
| 229 | { | 349 | { |
| 230 | if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) | 350 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
| 231 | free_deref_fetch_param(data->orig.data); | 351 | free_deref_fetch_param(data->orig.data); |
| 232 | else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) | 352 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) |
| 233 | free_symbol_cache(data->orig.data); | 353 | free_symbol_cache(data->orig.data); |
| 234 | kfree(data); | 354 | kfree(data); |
| 235 | } | 355 | } |
| @@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
| 240 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) | 360 | #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) |
| 241 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) | 361 | #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) |
| 242 | 362 | ||
| 243 | #define ASSIGN_FETCH_FUNC(kind, type) \ | 363 | /* Fetch types */ |
| 244 | .kind = FETCH_FUNC_NAME(kind, type) | 364 | enum { |
| 245 | 365 | FETCH_MTD_reg = 0, | |
| 246 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | 366 | FETCH_MTD_stack, |
| 247 | {.name = #ptype, \ | 367 | FETCH_MTD_retval, |
| 248 | .size = sizeof(ftype), \ | 368 | FETCH_MTD_memory, |
| 249 | .is_signed = sign, \ | 369 | FETCH_MTD_symbol, |
| 250 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | 370 | FETCH_MTD_deref, |
| 251 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | 371 | FETCH_MTD_END, |
| 252 | ASSIGN_FETCH_FUNC(reg, ftype), \ | 372 | }; |
| 253 | ASSIGN_FETCH_FUNC(stack, ftype), \ | 373 | |
| 254 | ASSIGN_FETCH_FUNC(retval, ftype), \ | 374 | #define ASSIGN_FETCH_FUNC(method, type) \ |
| 255 | ASSIGN_FETCH_FUNC(memory, ftype), \ | 375 | [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) |
| 256 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | 376 | |
| 257 | ASSIGN_FETCH_FUNC(deref, ftype), \ | 377 | #define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ |
| 378 | {.name = _name, \ | ||
| 379 | .size = _size, \ | ||
| 380 | .is_signed = sign, \ | ||
| 381 | .print = PRINT_TYPE_FUNC_NAME(ptype), \ | ||
| 382 | .fmt = PRINT_TYPE_FMT_NAME(ptype), \ | ||
| 383 | .fmttype = _fmttype, \ | ||
| 384 | .fetch = { \ | ||
| 385 | ASSIGN_FETCH_FUNC(reg, ftype), \ | ||
| 386 | ASSIGN_FETCH_FUNC(stack, ftype), \ | ||
| 387 | ASSIGN_FETCH_FUNC(retval, ftype), \ | ||
| 388 | ASSIGN_FETCH_FUNC(memory, ftype), \ | ||
| 389 | ASSIGN_FETCH_FUNC(symbol, ftype), \ | ||
| 390 | ASSIGN_FETCH_FUNC(deref, ftype), \ | ||
| 391 | } \ | ||
| 258 | } | 392 | } |
| 259 | 393 | ||
| 394 | #define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ | ||
| 395 | __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) | ||
| 396 | |||
| 397 | #define FETCH_TYPE_STRING 0 | ||
| 398 | #define FETCH_TYPE_STRSIZE 1 | ||
| 399 | |||
| 260 | /* Fetch type information table */ | 400 | /* Fetch type information table */ |
| 261 | static const struct fetch_type { | 401 | static const struct fetch_type { |
| 262 | const char *name; /* Name of type */ | 402 | const char *name; /* Name of type */ |
| @@ -264,14 +404,16 @@ static const struct fetch_type { | |||
| 264 | int is_signed; /* Signed flag */ | 404 | int is_signed; /* Signed flag */ |
| 265 | print_type_func_t print; /* Print functions */ | 405 | print_type_func_t print; /* Print functions */ |
| 266 | const char *fmt; /* Fromat string */ | 406 | const char *fmt; /* Fromat string */ |
| 407 | const char *fmttype; /* Name in format file */ | ||
| 267 | /* Fetch functions */ | 408 | /* Fetch functions */ |
| 268 | fetch_func_t reg; | 409 | fetch_func_t fetch[FETCH_MTD_END]; |
| 269 | fetch_func_t stack; | ||
| 270 | fetch_func_t retval; | ||
| 271 | fetch_func_t memory; | ||
| 272 | fetch_func_t symbol; | ||
| 273 | fetch_func_t deref; | ||
| 274 | } fetch_type_table[] = { | 410 | } fetch_type_table[] = { |
| 411 | /* Special types */ | ||
| 412 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | ||
| 413 | sizeof(u32), 1, "__data_loc char[]"), | ||
| 414 | [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, | ||
| 415 | string_size, sizeof(u32), 0, "u32"), | ||
| 416 | /* Basic types */ | ||
| 275 | ASSIGN_FETCH_TYPE(u8, u8, 0), | 417 | ASSIGN_FETCH_TYPE(u8, u8, 0), |
| 276 | ASSIGN_FETCH_TYPE(u16, u16, 0), | 418 | ASSIGN_FETCH_TYPE(u16, u16, 0), |
| 277 | ASSIGN_FETCH_TYPE(u32, u32, 0), | 419 | ASSIGN_FETCH_TYPE(u32, u32, 0), |
| @@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, | |||
| 302 | *(unsigned long *)dest = kernel_stack_pointer(regs); | 444 | *(unsigned long *)dest = kernel_stack_pointer(regs); |
| 303 | } | 445 | } |
| 304 | 446 | ||
| 447 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | ||
| 448 | fetch_func_t orig_fn) | ||
| 449 | { | ||
| 450 | int i; | ||
| 451 | |||
| 452 | if (type != &fetch_type_table[FETCH_TYPE_STRING]) | ||
| 453 | return NULL; /* Only string type needs size function */ | ||
| 454 | for (i = 0; i < FETCH_MTD_END; i++) | ||
| 455 | if (type->fetch[i] == orig_fn) | ||
| 456 | return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; | ||
| 457 | |||
| 458 | WARN_ON(1); /* This should not happen */ | ||
| 459 | return NULL; | ||
| 460 | } | ||
| 461 | |||
| 305 | /** | 462 | /** |
| 306 | * Kprobe event core functions | 463 | * Kprobe event core functions |
| 307 | */ | 464 | */ |
| 308 | 465 | ||
| 309 | struct probe_arg { | 466 | struct probe_arg { |
| 310 | struct fetch_param fetch; | 467 | struct fetch_param fetch; |
| 468 | struct fetch_param fetch_size; | ||
| 311 | unsigned int offset; /* Offset from argument entry */ | 469 | unsigned int offset; /* Offset from argument entry */ |
| 312 | const char *name; /* Name of this argument */ | 470 | const char *name; /* Name of this argument */ |
| 313 | const char *comm; /* Command of this argument */ | 471 | const char *comm; /* Command of this argument */ |
| @@ -429,9 +587,9 @@ error: | |||
| 429 | 587 | ||
| 430 | static void free_probe_arg(struct probe_arg *arg) | 588 | static void free_probe_arg(struct probe_arg *arg) |
| 431 | { | 589 | { |
| 432 | if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) | 590 | if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) |
| 433 | free_deref_fetch_param(arg->fetch.data); | 591 | free_deref_fetch_param(arg->fetch.data); |
| 434 | else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) | 592 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) |
| 435 | free_symbol_cache(arg->fetch.data); | 593 | free_symbol_cache(arg->fetch.data); |
| 436 | kfree(arg->name); | 594 | kfree(arg->name); |
| 437 | kfree(arg->comm); | 595 | kfree(arg->comm); |
| @@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
| 548 | 706 | ||
| 549 | if (strcmp(arg, "retval") == 0) { | 707 | if (strcmp(arg, "retval") == 0) { |
| 550 | if (is_return) | 708 | if (is_return) |
| 551 | f->fn = t->retval; | 709 | f->fn = t->fetch[FETCH_MTD_retval]; |
| 552 | else | 710 | else |
| 553 | ret = -EINVAL; | 711 | ret = -EINVAL; |
| 554 | } else if (strncmp(arg, "stack", 5) == 0) { | 712 | } else if (strncmp(arg, "stack", 5) == 0) { |
| @@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
| 562 | if (ret || param > PARAM_MAX_STACK) | 720 | if (ret || param > PARAM_MAX_STACK) |
| 563 | ret = -EINVAL; | 721 | ret = -EINVAL; |
| 564 | else { | 722 | else { |
| 565 | f->fn = t->stack; | 723 | f->fn = t->fetch[FETCH_MTD_stack]; |
| 566 | f->data = (void *)param; | 724 | f->data = (void *)param; |
| 567 | } | 725 | } |
| 568 | } else | 726 | } else |
| @@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 588 | case '%': /* named register */ | 746 | case '%': /* named register */ |
| 589 | ret = regs_query_register_offset(arg + 1); | 747 | ret = regs_query_register_offset(arg + 1); |
| 590 | if (ret >= 0) { | 748 | if (ret >= 0) { |
| 591 | f->fn = t->reg; | 749 | f->fn = t->fetch[FETCH_MTD_reg]; |
| 592 | f->data = (void *)(unsigned long)ret; | 750 | f->data = (void *)(unsigned long)ret; |
| 593 | ret = 0; | 751 | ret = 0; |
| 594 | } | 752 | } |
| @@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 598 | ret = strict_strtoul(arg + 1, 0, ¶m); | 756 | ret = strict_strtoul(arg + 1, 0, ¶m); |
| 599 | if (ret) | 757 | if (ret) |
| 600 | break; | 758 | break; |
| 601 | f->fn = t->memory; | 759 | f->fn = t->fetch[FETCH_MTD_memory]; |
| 602 | f->data = (void *)param; | 760 | f->data = (void *)param; |
| 603 | } else { | 761 | } else { |
| 604 | ret = split_symbol_offset(arg + 1, &offset); | 762 | ret = split_symbol_offset(arg + 1, &offset); |
| @@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 606 | break; | 764 | break; |
| 607 | f->data = alloc_symbol_cache(arg + 1, offset); | 765 | f->data = alloc_symbol_cache(arg + 1, offset); |
| 608 | if (f->data) | 766 | if (f->data) |
| 609 | f->fn = t->symbol; | 767 | f->fn = t->fetch[FETCH_MTD_symbol]; |
| 610 | } | 768 | } |
| 611 | break; | 769 | break; |
| 612 | case '+': /* deref memory */ | 770 | case '+': /* deref memory */ |
| @@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 636 | if (ret) | 794 | if (ret) |
| 637 | kfree(dprm); | 795 | kfree(dprm); |
| 638 | else { | 796 | else { |
| 639 | f->fn = t->deref; | 797 | f->fn = t->fetch[FETCH_MTD_deref]; |
| 640 | f->data = (void *)dprm; | 798 | f->data = (void *)dprm; |
| 641 | } | 799 | } |
| 642 | } | 800 | } |
| 643 | break; | 801 | break; |
| 644 | } | 802 | } |
| 645 | if (!ret && !f->fn) | 803 | if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ |
| 804 | pr_info("%s type has no corresponding fetch method.\n", | ||
| 805 | t->name); | ||
| 646 | ret = -EINVAL; | 806 | ret = -EINVAL; |
| 807 | } | ||
| 647 | return ret; | 808 | return ret; |
| 648 | } | 809 | } |
| 649 | 810 | ||
| @@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
| 652 | struct probe_arg *parg, int is_return) | 813 | struct probe_arg *parg, int is_return) |
| 653 | { | 814 | { |
| 654 | const char *t; | 815 | const char *t; |
| 816 | int ret; | ||
| 655 | 817 | ||
| 656 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 818 | if (strlen(arg) > MAX_ARGSTR_LEN) { |
| 657 | pr_info("Argument is too long.: %s\n", arg); | 819 | pr_info("Argument is too long.: %s\n", arg); |
| @@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, | |||
| 674 | } | 836 | } |
| 675 | parg->offset = tp->size; | 837 | parg->offset = tp->size; |
| 676 | tp->size += parg->type->size; | 838 | tp->size += parg->type->size; |
| 677 | return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); | 839 | ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); |
| 840 | if (ret >= 0) { | ||
| 841 | parg->fetch_size.fn = get_fetch_size_function(parg->type, | ||
| 842 | parg->fetch.fn); | ||
| 843 | parg->fetch_size.data = parg->fetch.data; | ||
| 844 | } | ||
| 845 | return ret; | ||
| 678 | } | 846 | } |
| 679 | 847 | ||
| 680 | /* Return 1 if name is reserved or already used by another argument */ | 848 | /* Return 1 if name is reserved or already used by another argument */ |
| @@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) | |||
| 757 | pr_info("Delete command needs an event name.\n"); | 925 | pr_info("Delete command needs an event name.\n"); |
| 758 | return -EINVAL; | 926 | return -EINVAL; |
| 759 | } | 927 | } |
| 928 | mutex_lock(&probe_lock); | ||
| 760 | tp = find_probe_event(event, group); | 929 | tp = find_probe_event(event, group); |
| 761 | if (!tp) { | 930 | if (!tp) { |
| 931 | mutex_unlock(&probe_lock); | ||
| 762 | pr_info("Event %s/%s doesn't exist.\n", group, event); | 932 | pr_info("Event %s/%s doesn't exist.\n", group, event); |
| 763 | return -ENOENT; | 933 | return -ENOENT; |
| 764 | } | 934 | } |
| 765 | /* delete an event */ | 935 | /* delete an event */ |
| 766 | unregister_trace_probe(tp); | 936 | unregister_trace_probe(tp); |
| 767 | free_trace_probe(tp); | 937 | free_trace_probe(tp); |
| 938 | mutex_unlock(&probe_lock); | ||
| 768 | return 0; | 939 | return 0; |
| 769 | } | 940 | } |
| 770 | 941 | ||
| @@ -1043,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = { | |||
| 1043 | .release = seq_release, | 1214 | .release = seq_release, |
| 1044 | }; | 1215 | }; |
| 1045 | 1216 | ||
| 1217 | /* Sum up total data length for dynamic arraies (strings) */ | ||
| 1218 | static __kprobes int __get_data_size(struct trace_probe *tp, | ||
| 1219 | struct pt_regs *regs) | ||
| 1220 | { | ||
| 1221 | int i, ret = 0; | ||
| 1222 | u32 len; | ||
| 1223 | |||
| 1224 | for (i = 0; i < tp->nr_args; i++) | ||
| 1225 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
| 1226 | call_fetch(&tp->args[i].fetch_size, regs, &len); | ||
| 1227 | ret += len; | ||
| 1228 | } | ||
| 1229 | |||
| 1230 | return ret; | ||
| 1231 | } | ||
| 1232 | |||
| 1233 | /* Store the value of each argument */ | ||
| 1234 | static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, | ||
| 1235 | struct pt_regs *regs, | ||
| 1236 | u8 *data, int maxlen) | ||
| 1237 | { | ||
| 1238 | int i; | ||
| 1239 | u32 end = tp->size; | ||
| 1240 | u32 *dl; /* Data (relative) location */ | ||
| 1241 | |||
| 1242 | for (i = 0; i < tp->nr_args; i++) { | ||
| 1243 | if (unlikely(tp->args[i].fetch_size.fn)) { | ||
| 1244 | /* | ||
| 1245 | * First, we set the relative location and | ||
| 1246 | * maximum data length to *dl | ||
| 1247 | */ | ||
| 1248 | dl = (u32 *)(data + tp->args[i].offset); | ||
| 1249 | *dl = make_data_rloc(maxlen, end - tp->args[i].offset); | ||
| 1250 | /* Then try to fetch string or dynamic array data */ | ||
| 1251 | call_fetch(&tp->args[i].fetch, regs, dl); | ||
| 1252 | /* Reduce maximum length */ | ||
| 1253 | end += get_rloc_len(*dl); | ||
| 1254 | maxlen -= get_rloc_len(*dl); | ||
| 1255 | /* Trick here, convert data_rloc to data_loc */ | ||
| 1256 | *dl = convert_rloc_to_loc(*dl, | ||
| 1257 | ent_size + tp->args[i].offset); | ||
| 1258 | } else | ||
| 1259 | /* Just fetching data normally */ | ||
| 1260 | call_fetch(&tp->args[i].fetch, regs, | ||
| 1261 | data + tp->args[i].offset); | ||
| 1262 | } | ||
| 1263 | } | ||
| 1264 | |||
| 1046 | /* Kprobe handler */ | 1265 | /* Kprobe handler */ |
| 1047 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | 1266 | static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) |
| 1048 | { | 1267 | { |
| @@ -1050,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
| 1050 | struct kprobe_trace_entry_head *entry; | 1269 | struct kprobe_trace_entry_head *entry; |
| 1051 | struct ring_buffer_event *event; | 1270 | struct ring_buffer_event *event; |
| 1052 | struct ring_buffer *buffer; | 1271 | struct ring_buffer *buffer; |
| 1053 | u8 *data; | 1272 | int size, dsize, pc; |
| 1054 | int size, i, pc; | ||
| 1055 | unsigned long irq_flags; | 1273 | unsigned long irq_flags; |
| 1056 | struct ftrace_event_call *call = &tp->call; | 1274 | struct ftrace_event_call *call = &tp->call; |
| 1057 | 1275 | ||
| @@ -1060,7 +1278,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
| 1060 | local_save_flags(irq_flags); | 1278 | local_save_flags(irq_flags); |
| 1061 | pc = preempt_count(); | 1279 | pc = preempt_count(); |
| 1062 | 1280 | ||
| 1063 | size = sizeof(*entry) + tp->size; | 1281 | dsize = __get_data_size(tp, regs); |
| 1282 | size = sizeof(*entry) + tp->size + dsize; | ||
| 1064 | 1283 | ||
| 1065 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 1284 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
| 1066 | size, irq_flags, pc); | 1285 | size, irq_flags, pc); |
| @@ -1069,9 +1288,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
| 1069 | 1288 | ||
| 1070 | entry = ring_buffer_event_data(event); | 1289 | entry = ring_buffer_event_data(event); |
| 1071 | entry->ip = (unsigned long)kp->addr; | 1290 | entry->ip = (unsigned long)kp->addr; |
| 1072 | data = (u8 *)&entry[1]; | 1291 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
| 1073 | for (i = 0; i < tp->nr_args; i++) | ||
| 1074 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
| 1075 | 1292 | ||
| 1076 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1293 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 1077 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1294 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
| @@ -1085,15 +1302,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
| 1085 | struct kretprobe_trace_entry_head *entry; | 1302 | struct kretprobe_trace_entry_head *entry; |
| 1086 | struct ring_buffer_event *event; | 1303 | struct ring_buffer_event *event; |
| 1087 | struct ring_buffer *buffer; | 1304 | struct ring_buffer *buffer; |
| 1088 | u8 *data; | 1305 | int size, pc, dsize; |
| 1089 | int size, i, pc; | ||
| 1090 | unsigned long irq_flags; | 1306 | unsigned long irq_flags; |
| 1091 | struct ftrace_event_call *call = &tp->call; | 1307 | struct ftrace_event_call *call = &tp->call; |
| 1092 | 1308 | ||
| 1093 | local_save_flags(irq_flags); | 1309 | local_save_flags(irq_flags); |
| 1094 | pc = preempt_count(); | 1310 | pc = preempt_count(); |
| 1095 | 1311 | ||
| 1096 | size = sizeof(*entry) + tp->size; | 1312 | dsize = __get_data_size(tp, regs); |
| 1313 | size = sizeof(*entry) + tp->size + dsize; | ||
| 1097 | 1314 | ||
| 1098 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, | 1315 | event = trace_current_buffer_lock_reserve(&buffer, call->event.type, |
| 1099 | size, irq_flags, pc); | 1316 | size, irq_flags, pc); |
| @@ -1103,9 +1320,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
| 1103 | entry = ring_buffer_event_data(event); | 1320 | entry = ring_buffer_event_data(event); |
| 1104 | entry->func = (unsigned long)tp->rp.kp.addr; | 1321 | entry->func = (unsigned long)tp->rp.kp.addr; |
| 1105 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1322 | entry->ret_ip = (unsigned long)ri->ret_addr; |
| 1106 | data = (u8 *)&entry[1]; | 1323 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
| 1107 | for (i = 0; i < tp->nr_args; i++) | ||
| 1108 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
| 1109 | 1324 | ||
| 1110 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1325 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 1111 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1326 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); |
| @@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, | |||
| 1137 | data = (u8 *)&field[1]; | 1352 | data = (u8 *)&field[1]; |
| 1138 | for (i = 0; i < tp->nr_args; i++) | 1353 | for (i = 0; i < tp->nr_args; i++) |
| 1139 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1354 | if (!tp->args[i].type->print(s, tp->args[i].name, |
| 1140 | data + tp->args[i].offset)) | 1355 | data + tp->args[i].offset, field)) |
| 1141 | goto partial; | 1356 | goto partial; |
| 1142 | 1357 | ||
| 1143 | if (!trace_seq_puts(s, "\n")) | 1358 | if (!trace_seq_puts(s, "\n")) |
| @@ -1179,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, | |||
| 1179 | data = (u8 *)&field[1]; | 1394 | data = (u8 *)&field[1]; |
| 1180 | for (i = 0; i < tp->nr_args; i++) | 1395 | for (i = 0; i < tp->nr_args; i++) |
| 1181 | if (!tp->args[i].type->print(s, tp->args[i].name, | 1396 | if (!tp->args[i].type->print(s, tp->args[i].name, |
| 1182 | data + tp->args[i].offset)) | 1397 | data + tp->args[i].offset, field)) |
| 1183 | goto partial; | 1398 | goto partial; |
| 1184 | 1399 | ||
| 1185 | if (!trace_seq_puts(s, "\n")) | 1400 | if (!trace_seq_puts(s, "\n")) |
| @@ -1214,11 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call) | |||
| 1214 | } | 1429 | } |
| 1215 | } | 1430 | } |
| 1216 | 1431 | ||
| 1217 | static int probe_event_raw_init(struct ftrace_event_call *event_call) | ||
| 1218 | { | ||
| 1219 | return 0; | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | #undef DEFINE_FIELD | 1432 | #undef DEFINE_FIELD |
| 1223 | #define DEFINE_FIELD(type, item, name, is_signed) \ | 1433 | #define DEFINE_FIELD(type, item, name, is_signed) \ |
| 1224 | do { \ | 1434 | do { \ |
| @@ -1239,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
| 1239 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | 1449 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); |
| 1240 | /* Set argument names as fields */ | 1450 | /* Set argument names as fields */ |
| 1241 | for (i = 0; i < tp->nr_args; i++) { | 1451 | for (i = 0; i < tp->nr_args; i++) { |
| 1242 | ret = trace_define_field(event_call, tp->args[i].type->name, | 1452 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, |
| 1243 | tp->args[i].name, | 1453 | tp->args[i].name, |
| 1244 | sizeof(field) + tp->args[i].offset, | 1454 | sizeof(field) + tp->args[i].offset, |
| 1245 | tp->args[i].type->size, | 1455 | tp->args[i].type->size, |
| @@ -1261,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
| 1261 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); | 1471 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); |
| 1262 | /* Set argument names as fields */ | 1472 | /* Set argument names as fields */ |
| 1263 | for (i = 0; i < tp->nr_args; i++) { | 1473 | for (i = 0; i < tp->nr_args; i++) { |
| 1264 | ret = trace_define_field(event_call, tp->args[i].type->name, | 1474 | ret = trace_define_field(event_call, tp->args[i].type->fmttype, |
| 1265 | tp->args[i].name, | 1475 | tp->args[i].name, |
| 1266 | sizeof(field) + tp->args[i].offset, | 1476 | sizeof(field) + tp->args[i].offset, |
| 1267 | tp->args[i].type->size, | 1477 | tp->args[i].type->size, |
| @@ -1301,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) | |||
| 1301 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); | 1511 | pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); |
| 1302 | 1512 | ||
| 1303 | for (i = 0; i < tp->nr_args; i++) { | 1513 | for (i = 0; i < tp->nr_args; i++) { |
| 1304 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | 1514 | if (strcmp(tp->args[i].type->name, "string") == 0) |
| 1305 | tp->args[i].name); | 1515 | pos += snprintf(buf + pos, LEN_OR_ZERO, |
| 1516 | ", __get_str(%s)", | ||
| 1517 | tp->args[i].name); | ||
| 1518 | else | ||
| 1519 | pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", | ||
| 1520 | tp->args[i].name); | ||
| 1306 | } | 1521 | } |
| 1307 | 1522 | ||
| 1308 | #undef LEN_OR_ZERO | 1523 | #undef LEN_OR_ZERO |
| @@ -1339,11 +1554,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
| 1339 | struct ftrace_event_call *call = &tp->call; | 1554 | struct ftrace_event_call *call = &tp->call; |
| 1340 | struct kprobe_trace_entry_head *entry; | 1555 | struct kprobe_trace_entry_head *entry; |
| 1341 | struct hlist_head *head; | 1556 | struct hlist_head *head; |
| 1342 | u8 *data; | 1557 | int size, __size, dsize; |
| 1343 | int size, __size, i; | ||
| 1344 | int rctx; | 1558 | int rctx; |
| 1345 | 1559 | ||
| 1346 | __size = sizeof(*entry) + tp->size; | 1560 | dsize = __get_data_size(tp, regs); |
| 1561 | __size = sizeof(*entry) + tp->size + dsize; | ||
| 1347 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1562 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1348 | size -= sizeof(u32); | 1563 | size -= sizeof(u32); |
| 1349 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | 1564 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
| @@ -1355,9 +1570,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
| 1355 | return; | 1570 | return; |
| 1356 | 1571 | ||
| 1357 | entry->ip = (unsigned long)kp->addr; | 1572 | entry->ip = (unsigned long)kp->addr; |
| 1358 | data = (u8 *)&entry[1]; | 1573 | memset(&entry[1], 0, dsize); |
| 1359 | for (i = 0; i < tp->nr_args; i++) | 1574 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
| 1360 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
| 1361 | 1575 | ||
| 1362 | head = this_cpu_ptr(call->perf_events); | 1576 | head = this_cpu_ptr(call->perf_events); |
| 1363 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 1577 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); |
| @@ -1371,11 +1585,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
| 1371 | struct ftrace_event_call *call = &tp->call; | 1585 | struct ftrace_event_call *call = &tp->call; |
| 1372 | struct kretprobe_trace_entry_head *entry; | 1586 | struct kretprobe_trace_entry_head *entry; |
| 1373 | struct hlist_head *head; | 1587 | struct hlist_head *head; |
| 1374 | u8 *data; | 1588 | int size, __size, dsize; |
| 1375 | int size, __size, i; | ||
| 1376 | int rctx; | 1589 | int rctx; |
| 1377 | 1590 | ||
| 1378 | __size = sizeof(*entry) + tp->size; | 1591 | dsize = __get_data_size(tp, regs); |
| 1592 | __size = sizeof(*entry) + tp->size + dsize; | ||
| 1379 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1593 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
| 1380 | size -= sizeof(u32); | 1594 | size -= sizeof(u32); |
| 1381 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | 1595 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, |
| @@ -1388,9 +1602,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
| 1388 | 1602 | ||
| 1389 | entry->func = (unsigned long)tp->rp.kp.addr; | 1603 | entry->func = (unsigned long)tp->rp.kp.addr; |
| 1390 | entry->ret_ip = (unsigned long)ri->ret_addr; | 1604 | entry->ret_ip = (unsigned long)ri->ret_addr; |
| 1391 | data = (u8 *)&entry[1]; | 1605 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
| 1392 | for (i = 0; i < tp->nr_args; i++) | ||
| 1393 | call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); | ||
| 1394 | 1606 | ||
| 1395 | head = this_cpu_ptr(call->perf_events); | 1607 | head = this_cpu_ptr(call->perf_events); |
| 1396 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1608 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); |
| @@ -1486,15 +1698,12 @@ static int register_probe_event(struct trace_probe *tp) | |||
| 1486 | int ret; | 1698 | int ret; |
| 1487 | 1699 | ||
| 1488 | /* Initialize ftrace_event_call */ | 1700 | /* Initialize ftrace_event_call */ |
| 1701 | INIT_LIST_HEAD(&call->class->fields); | ||
| 1489 | if (probe_is_return(tp)) { | 1702 | if (probe_is_return(tp)) { |
| 1490 | INIT_LIST_HEAD(&call->class->fields); | ||
| 1491 | call->event.funcs = &kretprobe_funcs; | 1703 | call->event.funcs = &kretprobe_funcs; |
| 1492 | call->class->raw_init = probe_event_raw_init; | ||
| 1493 | call->class->define_fields = kretprobe_event_define_fields; | 1704 | call->class->define_fields = kretprobe_event_define_fields; |
| 1494 | } else { | 1705 | } else { |
| 1495 | INIT_LIST_HEAD(&call->class->fields); | ||
| 1496 | call->event.funcs = &kprobe_funcs; | 1706 | call->event.funcs = &kprobe_funcs; |
| 1497 | call->class->raw_init = probe_event_raw_init; | ||
| 1498 | call->class->define_fields = kprobe_event_define_fields; | 1707 | call->class->define_fields = kprobe_event_define_fields; |
| 1499 | } | 1708 | } |
| 1500 | if (set_print_fmt(tp) < 0) | 1709 | if (set_print_fmt(tp) < 0) |
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf00749b65..000000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null | |||
| @@ -1,508 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * trace_ksym.c - Kernel Symbol Tracer | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2009 | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <linux/kallsyms.h> | ||
| 22 | #include <linux/uaccess.h> | ||
| 23 | #include <linux/debugfs.h> | ||
| 24 | #include <linux/ftrace.h> | ||
| 25 | #include <linux/module.h> | ||
| 26 | #include <linux/slab.h> | ||
| 27 | #include <linux/fs.h> | ||
| 28 | |||
| 29 | #include "trace_output.h" | ||
| 30 | #include "trace.h" | ||
| 31 | |||
| 32 | #include <linux/hw_breakpoint.h> | ||
| 33 | #include <asm/hw_breakpoint.h> | ||
| 34 | |||
| 35 | #include <asm/atomic.h> | ||
| 36 | |||
| 37 | #define KSYM_TRACER_OP_LEN 3 /* rw- */ | ||
| 38 | |||
| 39 | struct trace_ksym { | ||
| 40 | struct perf_event **ksym_hbp; | ||
| 41 | struct perf_event_attr attr; | ||
| 42 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
| 43 | atomic64_t counter; | ||
| 44 | #endif | ||
| 45 | struct hlist_node ksym_hlist; | ||
| 46 | }; | ||
| 47 | |||
| 48 | static struct trace_array *ksym_trace_array; | ||
| 49 | |||
| 50 | static unsigned int ksym_tracing_enabled; | ||
| 51 | |||
| 52 | static HLIST_HEAD(ksym_filter_head); | ||
| 53 | |||
| 54 | static DEFINE_MUTEX(ksym_tracer_mutex); | ||
| 55 | |||
| 56 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
| 57 | |||
| 58 | #define MAX_UL_INT 0xffffffff | ||
| 59 | |||
| 60 | void ksym_collect_stats(unsigned long hbp_hit_addr) | ||
| 61 | { | ||
| 62 | struct hlist_node *node; | ||
| 63 | struct trace_ksym *entry; | ||
| 64 | |||
| 65 | rcu_read_lock(); | ||
| 66 | hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { | ||
| 67 | if (entry->attr.bp_addr == hbp_hit_addr) { | ||
| 68 | atomic64_inc(&entry->counter); | ||
| 69 | break; | ||
| 70 | } | ||
| 71 | } | ||
| 72 | rcu_read_unlock(); | ||
| 73 | } | ||
| 74 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
| 75 | |||
| 76 | void ksym_hbp_handler(struct perf_event *hbp, int nmi, | ||
| 77 | struct perf_sample_data *data, | ||
| 78 | struct pt_regs *regs) | ||
| 79 | { | ||
| 80 | struct ring_buffer_event *event; | ||
| 81 | struct ksym_trace_entry *entry; | ||
| 82 | struct ring_buffer *buffer; | ||
| 83 | int pc; | ||
| 84 | |||
| 85 | if (!ksym_tracing_enabled) | ||
| 86 | return; | ||
| 87 | |||
| 88 | buffer = ksym_trace_array->buffer; | ||
| 89 | |||
| 90 | pc = preempt_count(); | ||
| 91 | |||
| 92 | event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, | ||
| 93 | sizeof(*entry), 0, pc); | ||
| 94 | if (!event) | ||
| 95 | return; | ||
| 96 | |||
| 97 | entry = ring_buffer_event_data(event); | ||
| 98 | entry->ip = instruction_pointer(regs); | ||
| 99 | entry->type = hw_breakpoint_type(hbp); | ||
| 100 | entry->addr = hw_breakpoint_addr(hbp); | ||
| 101 | strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); | ||
| 102 | |||
| 103 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
| 104 | ksym_collect_stats(hw_breakpoint_addr(hbp)); | ||
| 105 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
| 106 | |||
| 107 | trace_buffer_unlock_commit(buffer, event, 0, pc); | ||
| 108 | } | ||
| 109 | |||
| 110 | /* Valid access types are represented as | ||
| 111 | * | ||
| 112 | * rw- : Set Read/Write Access Breakpoint | ||
| 113 | * -w- : Set Write Access Breakpoint | ||
| 114 | * --- : Clear Breakpoints | ||
| 115 | * --x : Set Execution Break points (Not available yet) | ||
| 116 | * | ||
| 117 | */ | ||
| 118 | static int ksym_trace_get_access_type(char *str) | ||
| 119 | { | ||
| 120 | int access = 0; | ||
| 121 | |||
| 122 | if (str[0] == 'r') | ||
| 123 | access |= HW_BREAKPOINT_R; | ||
| 124 | |||
| 125 | if (str[1] == 'w') | ||
| 126 | access |= HW_BREAKPOINT_W; | ||
| 127 | |||
| 128 | if (str[2] == 'x') | ||
| 129 | access |= HW_BREAKPOINT_X; | ||
| 130 | |||
| 131 | switch (access) { | ||
| 132 | case HW_BREAKPOINT_R: | ||
| 133 | case HW_BREAKPOINT_W: | ||
| 134 | case HW_BREAKPOINT_W | HW_BREAKPOINT_R: | ||
| 135 | return access; | ||
| 136 | default: | ||
| 137 | return -EINVAL; | ||
| 138 | } | ||
| 139 | } | ||
| 140 | |||
| 141 | /* | ||
| 142 | * There can be several possible malformed requests and we attempt to capture | ||
| 143 | * all of them. We enumerate some of the rules | ||
| 144 | * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. | ||
| 145 | * i.e. multiple ':' symbols disallowed. Possible uses are of the form | ||
| 146 | * <module>:<ksym_name>:<op>. | ||
| 147 | * 2. No delimiter symbol ':' in the input string | ||
| 148 | * 3. Spurious operator symbols or symbols not in their respective positions | ||
| 149 | * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file | ||
| 150 | * 5. Kernel symbol not a part of /proc/kallsyms | ||
| 151 | * 6. Duplicate requests | ||
| 152 | */ | ||
| 153 | static int parse_ksym_trace_str(char *input_string, char **ksymname, | ||
| 154 | unsigned long *addr) | ||
| 155 | { | ||
| 156 | int ret; | ||
| 157 | |||
| 158 | *ksymname = strsep(&input_string, ":"); | ||
| 159 | *addr = kallsyms_lookup_name(*ksymname); | ||
| 160 | |||
| 161 | /* Check for malformed request: (2), (1) and (5) */ | ||
| 162 | if ((!input_string) || | ||
| 163 | (strlen(input_string) != KSYM_TRACER_OP_LEN) || | ||
| 164 | (*addr == 0)) | ||
| 165 | return -EINVAL;; | ||
| 166 | |||
| 167 | ret = ksym_trace_get_access_type(input_string); | ||
| 168 | |||
| 169 | return ret; | ||
| 170 | } | ||
| 171 | |||
| 172 | int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) | ||
| 173 | { | ||
| 174 | struct trace_ksym *entry; | ||
| 175 | int ret = -ENOMEM; | ||
| 176 | |||
| 177 | entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); | ||
| 178 | if (!entry) | ||
| 179 | return -ENOMEM; | ||
| 180 | |||
| 181 | hw_breakpoint_init(&entry->attr); | ||
| 182 | |||
| 183 | entry->attr.bp_type = op; | ||
| 184 | entry->attr.bp_addr = addr; | ||
| 185 | entry->attr.bp_len = HW_BREAKPOINT_LEN_4; | ||
| 186 | |||
| 187 | entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, | ||
| 188 | ksym_hbp_handler); | ||
| 189 | |||
| 190 | if (IS_ERR(entry->ksym_hbp)) { | ||
| 191 | ret = PTR_ERR(entry->ksym_hbp); | ||
| 192 | if (ret == -ENOSPC) { | ||
| 193 | printk(KERN_ERR "ksym_tracer: Maximum limit reached." | ||
| 194 | " No new requests for tracing can be accepted now.\n"); | ||
| 195 | } else { | ||
| 196 | printk(KERN_INFO "ksym_tracer request failed. Try again" | ||
| 197 | " later!!\n"); | ||
| 198 | } | ||
| 199 | goto err; | ||
| 200 | } | ||
| 201 | |||
| 202 | hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); | ||
| 203 | |||
| 204 | return 0; | ||
| 205 | |||
| 206 | err: | ||
| 207 | kfree(entry); | ||
| 208 | |||
| 209 | return ret; | ||
| 210 | } | ||
| 211 | |||
| 212 | static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, | ||
| 213 | size_t count, loff_t *ppos) | ||
| 214 | { | ||
| 215 | struct trace_ksym *entry; | ||
| 216 | struct hlist_node *node; | ||
| 217 | struct trace_seq *s; | ||
| 218 | ssize_t cnt = 0; | ||
| 219 | int ret; | ||
| 220 | |||
| 221 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
| 222 | if (!s) | ||
| 223 | return -ENOMEM; | ||
| 224 | trace_seq_init(s); | ||
| 225 | |||
| 226 | mutex_lock(&ksym_tracer_mutex); | ||
| 227 | |||
| 228 | hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { | ||
| 229 | ret = trace_seq_printf(s, "%pS:", | ||
| 230 | (void *)(unsigned long)entry->attr.bp_addr); | ||
| 231 | if (entry->attr.bp_type == HW_BREAKPOINT_R) | ||
| 232 | ret = trace_seq_puts(s, "r--\n"); | ||
| 233 | else if (entry->attr.bp_type == HW_BREAKPOINT_W) | ||
| 234 | ret = trace_seq_puts(s, "-w-\n"); | ||
| 235 | else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) | ||
| 236 | ret = trace_seq_puts(s, "rw-\n"); | ||
| 237 | WARN_ON_ONCE(!ret); | ||
| 238 | } | ||
| 239 | |||
| 240 | cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | ||
| 241 | |||
| 242 | mutex_unlock(&ksym_tracer_mutex); | ||
| 243 | |||
| 244 | kfree(s); | ||
| 245 | |||
| 246 | return cnt; | ||
| 247 | } | ||
| 248 | |||
| 249 | static void __ksym_trace_reset(void) | ||
| 250 | { | ||
| 251 | struct trace_ksym *entry; | ||
| 252 | struct hlist_node *node, *node1; | ||
| 253 | |||
| 254 | mutex_lock(&ksym_tracer_mutex); | ||
| 255 | hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, | ||
| 256 | ksym_hlist) { | ||
| 257 | unregister_wide_hw_breakpoint(entry->ksym_hbp); | ||
| 258 | hlist_del_rcu(&(entry->ksym_hlist)); | ||
| 259 | synchronize_rcu(); | ||
| 260 | kfree(entry); | ||
| 261 | } | ||
| 262 | mutex_unlock(&ksym_tracer_mutex); | ||
| 263 | } | ||
| 264 | |||
| 265 | static ssize_t ksym_trace_filter_write(struct file *file, | ||
| 266 | const char __user *buffer, | ||
| 267 | size_t count, loff_t *ppos) | ||
| 268 | { | ||
| 269 | struct trace_ksym *entry; | ||
| 270 | struct hlist_node *node; | ||
| 271 | char *buf, *input_string, *ksymname = NULL; | ||
| 272 | unsigned long ksym_addr = 0; | ||
| 273 | int ret, op, changed = 0; | ||
| 274 | |||
| 275 | buf = kzalloc(count + 1, GFP_KERNEL); | ||
| 276 | if (!buf) | ||
| 277 | return -ENOMEM; | ||
| 278 | |||
| 279 | ret = -EFAULT; | ||
| 280 | if (copy_from_user(buf, buffer, count)) | ||
| 281 | goto out; | ||
| 282 | |||
| 283 | buf[count] = '\0'; | ||
| 284 | input_string = strstrip(buf); | ||
| 285 | |||
| 286 | /* | ||
| 287 | * Clear all breakpoints if: | ||
| 288 | * 1: echo > ksym_trace_filter | ||
| 289 | * 2: echo 0 > ksym_trace_filter | ||
| 290 | * 3: echo "*:---" > ksym_trace_filter | ||
| 291 | */ | ||
| 292 | if (!input_string[0] || !strcmp(input_string, "0") || | ||
| 293 | !strcmp(input_string, "*:---")) { | ||
| 294 | __ksym_trace_reset(); | ||
| 295 | ret = 0; | ||
| 296 | goto out; | ||
| 297 | } | ||
| 298 | |||
| 299 | ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); | ||
| 300 | if (ret < 0) | ||
| 301 | goto out; | ||
| 302 | |||
| 303 | mutex_lock(&ksym_tracer_mutex); | ||
| 304 | |||
| 305 | ret = -EINVAL; | ||
| 306 | hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { | ||
| 307 | if (entry->attr.bp_addr == ksym_addr) { | ||
| 308 | /* Check for malformed request: (6) */ | ||
| 309 | if (entry->attr.bp_type != op) | ||
| 310 | changed = 1; | ||
| 311 | else | ||
| 312 | goto out_unlock; | ||
| 313 | break; | ||
| 314 | } | ||
| 315 | } | ||
| 316 | if (changed) { | ||
| 317 | unregister_wide_hw_breakpoint(entry->ksym_hbp); | ||
| 318 | entry->attr.bp_type = op; | ||
| 319 | ret = 0; | ||
| 320 | if (op > 0) { | ||
| 321 | entry->ksym_hbp = | ||
| 322 | register_wide_hw_breakpoint(&entry->attr, | ||
| 323 | ksym_hbp_handler); | ||
| 324 | if (IS_ERR(entry->ksym_hbp)) | ||
| 325 | ret = PTR_ERR(entry->ksym_hbp); | ||
| 326 | else | ||
| 327 | goto out_unlock; | ||
| 328 | } | ||
| 329 | /* Error or "symbol:---" case: drop it */ | ||
| 330 | hlist_del_rcu(&(entry->ksym_hlist)); | ||
| 331 | synchronize_rcu(); | ||
| 332 | kfree(entry); | ||
| 333 | goto out_unlock; | ||
| 334 | } else { | ||
| 335 | /* Check for malformed request: (4) */ | ||
| 336 | if (op) | ||
| 337 | ret = process_new_ksym_entry(ksymname, op, ksym_addr); | ||
| 338 | } | ||
| 339 | out_unlock: | ||
| 340 | mutex_unlock(&ksym_tracer_mutex); | ||
| 341 | out: | ||
| 342 | kfree(buf); | ||
| 343 | return !ret ? count : ret; | ||
| 344 | } | ||
| 345 | |||
| 346 | static const struct file_operations ksym_tracing_fops = { | ||
| 347 | .open = tracing_open_generic, | ||
| 348 | .read = ksym_trace_filter_read, | ||
| 349 | .write = ksym_trace_filter_write, | ||
| 350 | }; | ||
| 351 | |||
| 352 | static void ksym_trace_reset(struct trace_array *tr) | ||
| 353 | { | ||
| 354 | ksym_tracing_enabled = 0; | ||
| 355 | __ksym_trace_reset(); | ||
| 356 | } | ||
| 357 | |||
| 358 | static int ksym_trace_init(struct trace_array *tr) | ||
| 359 | { | ||
| 360 | int cpu, ret = 0; | ||
| 361 | |||
| 362 | for_each_online_cpu(cpu) | ||
| 363 | tracing_reset(tr, cpu); | ||
| 364 | ksym_tracing_enabled = 1; | ||
| 365 | ksym_trace_array = tr; | ||
| 366 | |||
| 367 | return ret; | ||
| 368 | } | ||
| 369 | |||
| 370 | static void ksym_trace_print_header(struct seq_file *m) | ||
| 371 | { | ||
| 372 | seq_puts(m, | ||
| 373 | "# TASK-PID CPU# Symbol " | ||
| 374 | "Type Function\n"); | ||
| 375 | seq_puts(m, | ||
| 376 | "# | | | " | ||
| 377 | " | |\n"); | ||
| 378 | } | ||
| 379 | |||
| 380 | static enum print_line_t ksym_trace_output(struct trace_iterator *iter) | ||
| 381 | { | ||
| 382 | struct trace_entry *entry = iter->ent; | ||
| 383 | struct trace_seq *s = &iter->seq; | ||
| 384 | struct ksym_trace_entry *field; | ||
| 385 | char str[KSYM_SYMBOL_LEN]; | ||
| 386 | int ret; | ||
| 387 | |||
| 388 | if (entry->type != TRACE_KSYM) | ||
| 389 | return TRACE_TYPE_UNHANDLED; | ||
| 390 | |||
| 391 | trace_assign_type(field, entry); | ||
| 392 | |||
| 393 | ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, | ||
| 394 | entry->pid, iter->cpu, (char *)field->addr); | ||
| 395 | if (!ret) | ||
| 396 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 397 | |||
| 398 | switch (field->type) { | ||
| 399 | case HW_BREAKPOINT_R: | ||
| 400 | ret = trace_seq_printf(s, " R "); | ||
| 401 | break; | ||
| 402 | case HW_BREAKPOINT_W: | ||
| 403 | ret = trace_seq_printf(s, " W "); | ||
| 404 | break; | ||
| 405 | case HW_BREAKPOINT_R | HW_BREAKPOINT_W: | ||
| 406 | ret = trace_seq_printf(s, " RW "); | ||
| 407 | break; | ||
| 408 | default: | ||
| 409 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 410 | } | ||
| 411 | |||
| 412 | if (!ret) | ||
| 413 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 414 | |||
| 415 | sprint_symbol(str, field->ip); | ||
| 416 | ret = trace_seq_printf(s, "%s\n", str); | ||
| 417 | if (!ret) | ||
| 418 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 419 | |||
| 420 | return TRACE_TYPE_HANDLED; | ||
| 421 | } | ||
| 422 | |||
| 423 | struct tracer ksym_tracer __read_mostly = | ||
| 424 | { | ||
| 425 | .name = "ksym_tracer", | ||
| 426 | .init = ksym_trace_init, | ||
| 427 | .reset = ksym_trace_reset, | ||
| 428 | #ifdef CONFIG_FTRACE_SELFTEST | ||
| 429 | .selftest = trace_selftest_startup_ksym, | ||
| 430 | #endif | ||
| 431 | .print_header = ksym_trace_print_header, | ||
| 432 | .print_line = ksym_trace_output | ||
| 433 | }; | ||
| 434 | |||
| 435 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
| 436 | static int ksym_profile_show(struct seq_file *m, void *v) | ||
| 437 | { | ||
| 438 | struct hlist_node *node; | ||
| 439 | struct trace_ksym *entry; | ||
| 440 | int access_type = 0; | ||
| 441 | char fn_name[KSYM_NAME_LEN]; | ||
| 442 | |||
| 443 | seq_puts(m, " Access Type "); | ||
| 444 | seq_puts(m, " Symbol Counter\n"); | ||
| 445 | seq_puts(m, " ----------- "); | ||
| 446 | seq_puts(m, " ------ -------\n"); | ||
| 447 | |||
| 448 | rcu_read_lock(); | ||
| 449 | hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { | ||
| 450 | |||
| 451 | access_type = entry->attr.bp_type; | ||
| 452 | |||
| 453 | switch (access_type) { | ||
| 454 | case HW_BREAKPOINT_R: | ||
| 455 | seq_puts(m, " R "); | ||
| 456 | break; | ||
| 457 | case HW_BREAKPOINT_W: | ||
| 458 | seq_puts(m, " W "); | ||
| 459 | break; | ||
| 460 | case HW_BREAKPOINT_R | HW_BREAKPOINT_W: | ||
| 461 | seq_puts(m, " RW "); | ||
| 462 | break; | ||
| 463 | default: | ||
| 464 | seq_puts(m, " NA "); | ||
| 465 | } | ||
| 466 | |||
| 467 | if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) | ||
| 468 | seq_printf(m, " %-36s", fn_name); | ||
| 469 | else | ||
| 470 | seq_printf(m, " %-36s", "<NA>"); | ||
| 471 | seq_printf(m, " %15llu\n", | ||
| 472 | (unsigned long long)atomic64_read(&entry->counter)); | ||
| 473 | } | ||
| 474 | rcu_read_unlock(); | ||
| 475 | |||
| 476 | return 0; | ||
| 477 | } | ||
| 478 | |||
| 479 | static int ksym_profile_open(struct inode *node, struct file *file) | ||
| 480 | { | ||
| 481 | return single_open(file, ksym_profile_show, NULL); | ||
| 482 | } | ||
| 483 | |||
| 484 | static const struct file_operations ksym_profile_fops = { | ||
| 485 | .open = ksym_profile_open, | ||
| 486 | .read = seq_read, | ||
| 487 | .llseek = seq_lseek, | ||
| 488 | .release = single_release, | ||
| 489 | }; | ||
| 490 | #endif /* CONFIG_PROFILE_KSYM_TRACER */ | ||
| 491 | |||
| 492 | __init static int init_ksym_trace(void) | ||
| 493 | { | ||
| 494 | struct dentry *d_tracer; | ||
| 495 | |||
| 496 | d_tracer = tracing_init_dentry(); | ||
| 497 | |||
| 498 | trace_create_file("ksym_trace_filter", 0644, d_tracer, | ||
| 499 | NULL, &ksym_tracing_fops); | ||
| 500 | |||
| 501 | #ifdef CONFIG_PROFILE_KSYM_TRACER | ||
| 502 | trace_create_file("ksym_profile", 0444, d_tracer, | ||
| 503 | NULL, &ksym_profile_fops); | ||
| 504 | #endif | ||
| 505 | |||
| 506 | return register_tracer(&ksym_tracer); | ||
| 507 | } | ||
| 508 | device_initcall(init_ksym_trace); | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b4596470..02272baa2206 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -16,9 +16,6 @@ | |||
| 16 | 16 | ||
| 17 | DECLARE_RWSEM(trace_event_mutex); | 17 | DECLARE_RWSEM(trace_event_mutex); |
| 18 | 18 | ||
| 19 | DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); | ||
| 20 | EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); | ||
| 21 | |||
| 22 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; | 19 | static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; |
| 23 | 20 | ||
| 24 | static int next_event_type = __TRACE_LAST_TYPE + 1; | 21 | static int next_event_type = __TRACE_LAST_TYPE + 1; |
| @@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = { | |||
| 1069 | .funcs = &trace_wake_funcs, | 1066 | .funcs = &trace_wake_funcs, |
| 1070 | }; | 1067 | }; |
| 1071 | 1068 | ||
| 1072 | /* TRACE_SPECIAL */ | ||
| 1073 | static enum print_line_t trace_special_print(struct trace_iterator *iter, | ||
| 1074 | int flags, struct trace_event *event) | ||
| 1075 | { | ||
| 1076 | struct special_entry *field; | ||
| 1077 | |||
| 1078 | trace_assign_type(field, iter->ent); | ||
| 1079 | |||
| 1080 | if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", | ||
| 1081 | field->arg1, | ||
| 1082 | field->arg2, | ||
| 1083 | field->arg3)) | ||
| 1084 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1085 | |||
| 1086 | return TRACE_TYPE_HANDLED; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static enum print_line_t trace_special_hex(struct trace_iterator *iter, | ||
| 1090 | int flags, struct trace_event *event) | ||
| 1091 | { | ||
| 1092 | struct special_entry *field; | ||
| 1093 | struct trace_seq *s = &iter->seq; | ||
| 1094 | |||
| 1095 | trace_assign_type(field, iter->ent); | ||
| 1096 | |||
| 1097 | SEQ_PUT_HEX_FIELD_RET(s, field->arg1); | ||
| 1098 | SEQ_PUT_HEX_FIELD_RET(s, field->arg2); | ||
| 1099 | SEQ_PUT_HEX_FIELD_RET(s, field->arg3); | ||
| 1100 | |||
| 1101 | return TRACE_TYPE_HANDLED; | ||
| 1102 | } | ||
| 1103 | |||
| 1104 | static enum print_line_t trace_special_bin(struct trace_iterator *iter, | ||
| 1105 | int flags, struct trace_event *event) | ||
| 1106 | { | ||
| 1107 | struct special_entry *field; | ||
| 1108 | struct trace_seq *s = &iter->seq; | ||
| 1109 | |||
| 1110 | trace_assign_type(field, iter->ent); | ||
| 1111 | |||
| 1112 | SEQ_PUT_FIELD_RET(s, field->arg1); | ||
| 1113 | SEQ_PUT_FIELD_RET(s, field->arg2); | ||
| 1114 | SEQ_PUT_FIELD_RET(s, field->arg3); | ||
| 1115 | |||
| 1116 | return TRACE_TYPE_HANDLED; | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | static struct trace_event_functions trace_special_funcs = { | ||
| 1120 | .trace = trace_special_print, | ||
| 1121 | .raw = trace_special_print, | ||
| 1122 | .hex = trace_special_hex, | ||
| 1123 | .binary = trace_special_bin, | ||
| 1124 | }; | ||
| 1125 | |||
| 1126 | static struct trace_event trace_special_event = { | ||
| 1127 | .type = TRACE_SPECIAL, | ||
| 1128 | .funcs = &trace_special_funcs, | ||
| 1129 | }; | ||
| 1130 | |||
| 1131 | /* TRACE_STACK */ | 1069 | /* TRACE_STACK */ |
| 1132 | 1070 | ||
| 1133 | static enum print_line_t trace_stack_print(struct trace_iterator *iter, | 1071 | static enum print_line_t trace_stack_print(struct trace_iterator *iter, |
| @@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
| 1161 | 1099 | ||
| 1162 | static struct trace_event_functions trace_stack_funcs = { | 1100 | static struct trace_event_functions trace_stack_funcs = { |
| 1163 | .trace = trace_stack_print, | 1101 | .trace = trace_stack_print, |
| 1164 | .raw = trace_special_print, | ||
| 1165 | .hex = trace_special_hex, | ||
| 1166 | .binary = trace_special_bin, | ||
| 1167 | }; | 1102 | }; |
| 1168 | 1103 | ||
| 1169 | static struct trace_event trace_stack_event = { | 1104 | static struct trace_event trace_stack_event = { |
| @@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, | |||
| 1194 | 1129 | ||
| 1195 | static struct trace_event_functions trace_user_stack_funcs = { | 1130 | static struct trace_event_functions trace_user_stack_funcs = { |
| 1196 | .trace = trace_user_stack_print, | 1131 | .trace = trace_user_stack_print, |
| 1197 | .raw = trace_special_print, | ||
| 1198 | .hex = trace_special_hex, | ||
| 1199 | .binary = trace_special_bin, | ||
| 1200 | }; | 1132 | }; |
| 1201 | 1133 | ||
| 1202 | static struct trace_event trace_user_stack_event = { | 1134 | static struct trace_event trace_user_stack_event = { |
| @@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = { | |||
| 1314 | &trace_fn_event, | 1246 | &trace_fn_event, |
| 1315 | &trace_ctx_event, | 1247 | &trace_ctx_event, |
| 1316 | &trace_wake_event, | 1248 | &trace_wake_event, |
| 1317 | &trace_special_event, | ||
| 1318 | &trace_stack_event, | 1249 | &trace_stack_event, |
| 1319 | &trace_user_stack_event, | 1250 | &trace_user_stack_event, |
| 1320 | &trace_bprint_event, | 1251 | &trace_bprint_event, |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c5..4086eae6e81b 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 46 | struct trace_array_cpu *data; | 46 | struct trace_array_cpu *data; |
| 47 | unsigned long flags; | 47 | unsigned long flags; |
| 48 | long disabled; | 48 | long disabled; |
| 49 | int resched; | ||
| 50 | int cpu; | 49 | int cpu; |
| 51 | int pc; | 50 | int pc; |
| 52 | 51 | ||
| @@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 54 | return; | 53 | return; |
| 55 | 54 | ||
| 56 | pc = preempt_count(); | 55 | pc = preempt_count(); |
| 57 | resched = ftrace_preempt_disable(); | 56 | preempt_disable_notrace(); |
| 58 | 57 | ||
| 59 | cpu = raw_smp_processor_id(); | 58 | cpu = raw_smp_processor_id(); |
| 60 | if (cpu != wakeup_current_cpu) | 59 | if (cpu != wakeup_current_cpu) |
| @@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 74 | out: | 73 | out: |
| 75 | atomic_dec(&data->disabled); | 74 | atomic_dec(&data->disabled); |
| 76 | out_enable: | 75 | out_enable: |
| 77 | ftrace_preempt_enable(resched); | 76 | preempt_enable_notrace(); |
| 78 | } | 77 | } |
| 79 | 78 | ||
| 80 | static struct ftrace_ops trace_ops __read_mostly = | 79 | static struct ftrace_ops trace_ops __read_mostly = |
| @@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
| 383 | #ifdef CONFIG_FTRACE_SELFTEST | 382 | #ifdef CONFIG_FTRACE_SELFTEST |
| 384 | .selftest = trace_selftest_startup_wakeup, | 383 | .selftest = trace_selftest_startup_wakeup, |
| 385 | #endif | 384 | #endif |
| 385 | .use_max_tr = 1, | ||
| 386 | }; | 386 | }; |
| 387 | 387 | ||
| 388 | static struct tracer wakeup_rt_tracer __read_mostly = | 388 | static struct tracer wakeup_rt_tracer __read_mostly = |
| @@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
| 397 | #ifdef CONFIG_FTRACE_SELFTEST | 397 | #ifdef CONFIG_FTRACE_SELFTEST |
| 398 | .selftest = trace_selftest_startup_wakeup, | 398 | .selftest = trace_selftest_startup_wakeup, |
| 399 | #endif | 399 | #endif |
| 400 | .use_max_tr = 1, | ||
| 400 | }; | 401 | }; |
| 401 | 402 | ||
| 402 | __init static int init_wakeup_tracer(void) | 403 | __init static int init_wakeup_tracer(void) |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9bd2f0..155a415b3209 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) | |||
| 13 | case TRACE_WAKE: | 13 | case TRACE_WAKE: |
| 14 | case TRACE_STACK: | 14 | case TRACE_STACK: |
| 15 | case TRACE_PRINT: | 15 | case TRACE_PRINT: |
| 16 | case TRACE_SPECIAL: | ||
| 17 | case TRACE_BRANCH: | 16 | case TRACE_BRANCH: |
| 18 | case TRACE_GRAPH_ENT: | 17 | case TRACE_GRAPH_ENT: |
| 19 | case TRACE_GRAPH_RET: | 18 | case TRACE_GRAPH_RET: |
| 20 | case TRACE_KSYM: | ||
| 21 | return 1; | 19 | return 1; |
| 22 | } | 20 | } |
| 23 | return 0; | 21 | return 0; |
| @@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr | |||
| 691 | } | 689 | } |
| 692 | #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ | 690 | #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ |
| 693 | 691 | ||
| 694 | #ifdef CONFIG_SYSPROF_TRACER | ||
| 695 | int | ||
| 696 | trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) | ||
| 697 | { | ||
| 698 | unsigned long count; | ||
| 699 | int ret; | ||
| 700 | |||
| 701 | /* start the tracing */ | ||
| 702 | ret = tracer_init(trace, tr); | ||
| 703 | if (ret) { | ||
| 704 | warn_failed_init_tracer(trace, ret); | ||
| 705 | return ret; | ||
| 706 | } | ||
| 707 | |||
| 708 | /* Sleep for a 1/10 of a second */ | ||
| 709 | msleep(100); | ||
| 710 | /* stop the tracing. */ | ||
| 711 | tracing_stop(); | ||
| 712 | /* check the trace buffer */ | ||
| 713 | ret = trace_test_buffer(tr, &count); | ||
| 714 | trace->reset(tr); | ||
| 715 | tracing_start(); | ||
| 716 | |||
| 717 | if (!ret && !count) { | ||
| 718 | printk(KERN_CONT ".. no entries found .."); | ||
| 719 | ret = -1; | ||
| 720 | } | ||
| 721 | |||
| 722 | return ret; | ||
| 723 | } | ||
| 724 | #endif /* CONFIG_SYSPROF_TRACER */ | ||
| 725 | |||
| 726 | #ifdef CONFIG_BRANCH_TRACER | 692 | #ifdef CONFIG_BRANCH_TRACER |
| 727 | int | 693 | int |
| 728 | trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | 694 | trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) |
| @@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) | |||
| 755 | } | 721 | } |
| 756 | #endif /* CONFIG_BRANCH_TRACER */ | 722 | #endif /* CONFIG_BRANCH_TRACER */ |
| 757 | 723 | ||
| 758 | #ifdef CONFIG_KSYM_TRACER | ||
| 759 | static int ksym_selftest_dummy; | ||
| 760 | |||
| 761 | int | ||
| 762 | trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) | ||
| 763 | { | ||
| 764 | unsigned long count; | ||
| 765 | int ret; | ||
| 766 | |||
| 767 | /* start the tracing */ | ||
| 768 | ret = tracer_init(trace, tr); | ||
| 769 | if (ret) { | ||
| 770 | warn_failed_init_tracer(trace, ret); | ||
| 771 | return ret; | ||
| 772 | } | ||
| 773 | |||
| 774 | ksym_selftest_dummy = 0; | ||
| 775 | /* Register the read-write tracing request */ | ||
| 776 | |||
| 777 | ret = process_new_ksym_entry("ksym_selftest_dummy", | ||
| 778 | HW_BREAKPOINT_R | HW_BREAKPOINT_W, | ||
| 779 | (unsigned long)(&ksym_selftest_dummy)); | ||
| 780 | |||
| 781 | if (ret < 0) { | ||
| 782 | printk(KERN_CONT "ksym_trace read-write startup test failed\n"); | ||
| 783 | goto ret_path; | ||
| 784 | } | ||
| 785 | /* Perform a read and a write operation over the dummy variable to | ||
| 786 | * trigger the tracer | ||
| 787 | */ | ||
| 788 | if (ksym_selftest_dummy == 0) | ||
| 789 | ksym_selftest_dummy++; | ||
| 790 | |||
| 791 | /* stop the tracing. */ | ||
| 792 | tracing_stop(); | ||
| 793 | /* check the trace buffer */ | ||
| 794 | ret = trace_test_buffer(tr, &count); | ||
| 795 | trace->reset(tr); | ||
| 796 | tracing_start(); | ||
| 797 | |||
| 798 | /* read & write operations - one each is performed on the dummy variable | ||
| 799 | * triggering two entries in the trace buffer | ||
| 800 | */ | ||
| 801 | if (!ret && count != 2) { | ||
| 802 | printk(KERN_CONT "Ksym tracer startup test failed"); | ||
| 803 | ret = -1; | ||
| 804 | } | ||
| 805 | |||
| 806 | ret_path: | ||
| 807 | return ret; | ||
| 808 | } | ||
| 809 | #endif /* CONFIG_KSYM_TRACER */ | ||
| 810 | |||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5f..056468eae7cf 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -110,12 +110,12 @@ static inline void check_stack(void) | |||
| 110 | static void | 110 | static void |
| 111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) | 111 | stack_trace_call(unsigned long ip, unsigned long parent_ip) |
| 112 | { | 112 | { |
| 113 | int cpu, resched; | 113 | int cpu; |
| 114 | 114 | ||
| 115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) | 115 | if (unlikely(!ftrace_enabled || stack_trace_disabled)) |
| 116 | return; | 116 | return; |
| 117 | 117 | ||
| 118 | resched = ftrace_preempt_disable(); | 118 | preempt_disable_notrace(); |
| 119 | 119 | ||
| 120 | cpu = raw_smp_processor_id(); | 120 | cpu = raw_smp_processor_id(); |
| 121 | /* no atomic needed, we only modify this variable by this cpu */ | 121 | /* no atomic needed, we only modify this variable by this cpu */ |
| @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 127 | out: | 127 | out: |
| 128 | per_cpu(trace_active, cpu)--; | 128 | per_cpu(trace_active, cpu)--; |
| 129 | /* prevent recursion in schedule */ | 129 | /* prevent recursion in schedule */ |
| 130 | ftrace_preempt_enable(resched); | 130 | preempt_enable_notrace(); |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304b..bac752f0cfb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, | |||
| 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); | 23 | static int syscall_enter_define_fields(struct ftrace_event_call *call); |
| 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); | 24 | static int syscall_exit_define_fields(struct ftrace_event_call *call); |
| 25 | 25 | ||
| 26 | /* All syscall exit events have the same fields */ | ||
| 27 | static LIST_HEAD(syscall_exit_fields); | ||
| 28 | |||
| 26 | static struct list_head * | 29 | static struct list_head * |
| 27 | syscall_get_enter_fields(struct ftrace_event_call *call) | 30 | syscall_get_enter_fields(struct ftrace_event_call *call) |
| 28 | { | 31 | { |
| @@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) | |||
| 34 | static struct list_head * | 37 | static struct list_head * |
| 35 | syscall_get_exit_fields(struct ftrace_event_call *call) | 38 | syscall_get_exit_fields(struct ftrace_event_call *call) |
| 36 | { | 39 | { |
| 37 | struct syscall_metadata *entry = call->data; | 40 | return &syscall_exit_fields; |
| 38 | |||
| 39 | return &entry->exit_fields; | ||
| 40 | } | 41 | } |
| 41 | 42 | ||
| 42 | struct trace_event_functions enter_syscall_print_funcs = { | 43 | struct trace_event_functions enter_syscall_print_funcs = { |
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a552ca9..000000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null | |||
| @@ -1,329 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * trace stack traces | ||
| 3 | * | ||
| 4 | * Copyright (C) 2004-2008, Soeren Sandmann | ||
| 5 | * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> | ||
| 6 | * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> | ||
| 7 | */ | ||
| 8 | #include <linux/kallsyms.h> | ||
| 9 | #include <linux/debugfs.h> | ||
| 10 | #include <linux/hrtimer.h> | ||
| 11 | #include <linux/uaccess.h> | ||
| 12 | #include <linux/ftrace.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/irq.h> | ||
| 15 | #include <linux/fs.h> | ||
| 16 | |||
| 17 | #include <asm/stacktrace.h> | ||
| 18 | |||
| 19 | #include "trace.h" | ||
| 20 | |||
| 21 | static struct trace_array *sysprof_trace; | ||
| 22 | static int __read_mostly tracer_enabled; | ||
| 23 | |||
| 24 | /* | ||
| 25 | * 1 msec sample interval by default: | ||
| 26 | */ | ||
| 27 | static unsigned long sample_period = 1000000; | ||
| 28 | static const unsigned int sample_max_depth = 512; | ||
| 29 | |||
| 30 | static DEFINE_MUTEX(sample_timer_lock); | ||
| 31 | /* | ||
| 32 | * Per CPU hrtimers that do the profiling: | ||
| 33 | */ | ||
| 34 | static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); | ||
| 35 | |||
| 36 | struct stack_frame { | ||
| 37 | const void __user *next_fp; | ||
| 38 | unsigned long return_address; | ||
| 39 | }; | ||
| 40 | |||
| 41 | static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) | ||
| 42 | { | ||
| 43 | int ret; | ||
| 44 | |||
| 45 | if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) | ||
| 46 | return 0; | ||
| 47 | |||
| 48 | ret = 1; | ||
| 49 | pagefault_disable(); | ||
| 50 | if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) | ||
| 51 | ret = 0; | ||
| 52 | pagefault_enable(); | ||
| 53 | |||
| 54 | return ret; | ||
| 55 | } | ||
| 56 | |||
| 57 | struct backtrace_info { | ||
| 58 | struct trace_array_cpu *data; | ||
| 59 | struct trace_array *tr; | ||
| 60 | int pos; | ||
| 61 | }; | ||
| 62 | |||
| 63 | static void | ||
| 64 | backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) | ||
| 65 | { | ||
| 66 | /* Ignore warnings */ | ||
| 67 | } | ||
| 68 | |||
| 69 | static void backtrace_warning(void *data, char *msg) | ||
| 70 | { | ||
| 71 | /* Ignore warnings */ | ||
| 72 | } | ||
| 73 | |||
| 74 | static int backtrace_stack(void *data, char *name) | ||
| 75 | { | ||
| 76 | /* Don't bother with IRQ stacks for now */ | ||
| 77 | return -1; | ||
| 78 | } | ||
| 79 | |||
| 80 | static void backtrace_address(void *data, unsigned long addr, int reliable) | ||
| 81 | { | ||
| 82 | struct backtrace_info *info = data; | ||
| 83 | |||
| 84 | if (info->pos < sample_max_depth && reliable) { | ||
| 85 | __trace_special(info->tr, info->data, 1, addr, 0); | ||
| 86 | |||
| 87 | info->pos++; | ||
| 88 | } | ||
| 89 | } | ||
| 90 | |||
| 91 | static const struct stacktrace_ops backtrace_ops = { | ||
| 92 | .warning = backtrace_warning, | ||
| 93 | .warning_symbol = backtrace_warning_symbol, | ||
| 94 | .stack = backtrace_stack, | ||
| 95 | .address = backtrace_address, | ||
| 96 | .walk_stack = print_context_stack, | ||
| 97 | }; | ||
| 98 | |||
| 99 | static int | ||
| 100 | trace_kernel(struct pt_regs *regs, struct trace_array *tr, | ||
| 101 | struct trace_array_cpu *data) | ||
| 102 | { | ||
| 103 | struct backtrace_info info; | ||
| 104 | unsigned long bp; | ||
| 105 | char *stack; | ||
| 106 | |||
| 107 | info.tr = tr; | ||
| 108 | info.data = data; | ||
| 109 | info.pos = 1; | ||
| 110 | |||
| 111 | __trace_special(info.tr, info.data, 1, regs->ip, 0); | ||
| 112 | |||
| 113 | stack = ((char *)regs + sizeof(struct pt_regs)); | ||
| 114 | #ifdef CONFIG_FRAME_POINTER | ||
| 115 | bp = regs->bp; | ||
| 116 | #else | ||
| 117 | bp = 0; | ||
| 118 | #endif | ||
| 119 | |||
| 120 | dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); | ||
| 121 | |||
| 122 | return info.pos; | ||
| 123 | } | ||
| 124 | |||
| 125 | static void timer_notify(struct pt_regs *regs, int cpu) | ||
| 126 | { | ||
| 127 | struct trace_array_cpu *data; | ||
| 128 | struct stack_frame frame; | ||
| 129 | struct trace_array *tr; | ||
| 130 | const void __user *fp; | ||
| 131 | int is_user; | ||
| 132 | int i; | ||
| 133 | |||
| 134 | if (!regs) | ||
| 135 | return; | ||
| 136 | |||
| 137 | tr = sysprof_trace; | ||
| 138 | data = tr->data[cpu]; | ||
| 139 | is_user = user_mode(regs); | ||
| 140 | |||
| 141 | if (!current || current->pid == 0) | ||
| 142 | return; | ||
| 143 | |||
| 144 | if (is_user && current->state != TASK_RUNNING) | ||
| 145 | return; | ||
| 146 | |||
| 147 | __trace_special(tr, data, 0, 0, current->pid); | ||
| 148 | |||
| 149 | if (!is_user) | ||
| 150 | i = trace_kernel(regs, tr, data); | ||
| 151 | else | ||
| 152 | i = 0; | ||
| 153 | |||
| 154 | /* | ||
| 155 | * Trace user stack if we are not a kernel thread | ||
| 156 | */ | ||
| 157 | if (current->mm && i < sample_max_depth) { | ||
| 158 | regs = (struct pt_regs *)current->thread.sp0 - 1; | ||
| 159 | |||
| 160 | fp = (void __user *)regs->bp; | ||
| 161 | |||
| 162 | __trace_special(tr, data, 2, regs->ip, 0); | ||
| 163 | |||
| 164 | while (i < sample_max_depth) { | ||
| 165 | frame.next_fp = NULL; | ||
| 166 | frame.return_address = 0; | ||
| 167 | if (!copy_stack_frame(fp, &frame)) | ||
| 168 | break; | ||
| 169 | if ((unsigned long)fp < regs->sp) | ||
| 170 | break; | ||
| 171 | |||
| 172 | __trace_special(tr, data, 2, frame.return_address, | ||
| 173 | (unsigned long)fp); | ||
| 174 | fp = frame.next_fp; | ||
| 175 | |||
| 176 | i++; | ||
| 177 | } | ||
| 178 | |||
| 179 | } | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Special trace entry if we overflow the max depth: | ||
| 183 | */ | ||
| 184 | if (i == sample_max_depth) | ||
| 185 | __trace_special(tr, data, -1, -1, -1); | ||
| 186 | |||
| 187 | __trace_special(tr, data, 3, current->pid, i); | ||
| 188 | } | ||
| 189 | |||
| 190 | static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) | ||
| 191 | { | ||
| 192 | /* trace here */ | ||
| 193 | timer_notify(get_irq_regs(), smp_processor_id()); | ||
| 194 | |||
| 195 | hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); | ||
| 196 | |||
| 197 | return HRTIMER_RESTART; | ||
| 198 | } | ||
| 199 | |||
| 200 | static void start_stack_timer(void *unused) | ||
| 201 | { | ||
| 202 | struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); | ||
| 203 | |||
| 204 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 205 | hrtimer->function = stack_trace_timer_fn; | ||
| 206 | |||
| 207 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | ||
| 208 | HRTIMER_MODE_REL_PINNED); | ||
| 209 | } | ||
| 210 | |||
| 211 | static void start_stack_timers(void) | ||
| 212 | { | ||
| 213 | on_each_cpu(start_stack_timer, NULL, 1); | ||
| 214 | } | ||
| 215 | |||
| 216 | static void stop_stack_timer(int cpu) | ||
| 217 | { | ||
| 218 | struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); | ||
| 219 | |||
| 220 | hrtimer_cancel(hrtimer); | ||
| 221 | } | ||
| 222 | |||
| 223 | static void stop_stack_timers(void) | ||
| 224 | { | ||
| 225 | int cpu; | ||
| 226 | |||
| 227 | for_each_online_cpu(cpu) | ||
| 228 | stop_stack_timer(cpu); | ||
| 229 | } | ||
| 230 | |||
| 231 | static void stop_stack_trace(struct trace_array *tr) | ||
| 232 | { | ||
| 233 | mutex_lock(&sample_timer_lock); | ||
| 234 | stop_stack_timers(); | ||
| 235 | tracer_enabled = 0; | ||
| 236 | mutex_unlock(&sample_timer_lock); | ||
| 237 | } | ||
| 238 | |||
| 239 | static int stack_trace_init(struct trace_array *tr) | ||
| 240 | { | ||
| 241 | sysprof_trace = tr; | ||
| 242 | |||
| 243 | tracing_start_cmdline_record(); | ||
| 244 | |||
| 245 | mutex_lock(&sample_timer_lock); | ||
| 246 | start_stack_timers(); | ||
| 247 | tracer_enabled = 1; | ||
| 248 | mutex_unlock(&sample_timer_lock); | ||
| 249 | return 0; | ||
| 250 | } | ||
| 251 | |||
| 252 | static void stack_trace_reset(struct trace_array *tr) | ||
| 253 | { | ||
| 254 | tracing_stop_cmdline_record(); | ||
| 255 | stop_stack_trace(tr); | ||
| 256 | } | ||
| 257 | |||
| 258 | static struct tracer stack_trace __read_mostly = | ||
| 259 | { | ||
| 260 | .name = "sysprof", | ||
| 261 | .init = stack_trace_init, | ||
| 262 | .reset = stack_trace_reset, | ||
| 263 | #ifdef CONFIG_FTRACE_SELFTEST | ||
| 264 | .selftest = trace_selftest_startup_sysprof, | ||
| 265 | #endif | ||
| 266 | }; | ||
| 267 | |||
| 268 | __init static int init_stack_trace(void) | ||
| 269 | { | ||
| 270 | return register_tracer(&stack_trace); | ||
| 271 | } | ||
| 272 | device_initcall(init_stack_trace); | ||
| 273 | |||
| 274 | #define MAX_LONG_DIGITS 22 | ||
| 275 | |||
| 276 | static ssize_t | ||
| 277 | sysprof_sample_read(struct file *filp, char __user *ubuf, | ||
| 278 | size_t cnt, loff_t *ppos) | ||
| 279 | { | ||
| 280 | char buf[MAX_LONG_DIGITS]; | ||
| 281 | int r; | ||
| 282 | |||
| 283 | r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); | ||
| 284 | |||
| 285 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
| 286 | } | ||
| 287 | |||
| 288 | static ssize_t | ||
| 289 | sysprof_sample_write(struct file *filp, const char __user *ubuf, | ||
| 290 | size_t cnt, loff_t *ppos) | ||
| 291 | { | ||
| 292 | char buf[MAX_LONG_DIGITS]; | ||
| 293 | unsigned long val; | ||
| 294 | |||
| 295 | if (cnt > MAX_LONG_DIGITS-1) | ||
| 296 | cnt = MAX_LONG_DIGITS-1; | ||
| 297 | |||
| 298 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 299 | return -EFAULT; | ||
| 300 | |||
| 301 | buf[cnt] = 0; | ||
| 302 | |||
| 303 | val = simple_strtoul(buf, NULL, 10); | ||
| 304 | /* | ||
| 305 | * Enforce a minimum sample period of 100 usecs: | ||
| 306 | */ | ||
| 307 | if (val < 100) | ||
| 308 | val = 100; | ||
| 309 | |||
| 310 | mutex_lock(&sample_timer_lock); | ||
| 311 | stop_stack_timers(); | ||
| 312 | sample_period = val * 1000; | ||
| 313 | start_stack_timers(); | ||
| 314 | mutex_unlock(&sample_timer_lock); | ||
| 315 | |||
| 316 | return cnt; | ||
| 317 | } | ||
| 318 | |||
| 319 | static const struct file_operations sysprof_sample_fops = { | ||
| 320 | .read = sysprof_sample_read, | ||
| 321 | .write = sysprof_sample_write, | ||
| 322 | }; | ||
| 323 | |||
| 324 | void init_tracer_sysprof_debugfs(struct dentry *d_tracer) | ||
| 325 | { | ||
| 326 | |||
| 327 | trace_create_file("sysprof_sample_period", 0644, | ||
| 328 | d_tracer, NULL, &sysprof_sample_fops); | ||
| 329 | } | ||
diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 000000000000..613bc1f04610 --- /dev/null +++ b/kernel/watchdog.c | |||
| @@ -0,0 +1,567 @@ | |||
| 1 | /* | ||
| 2 | * Detect hard and soft lockups on a system | ||
| 3 | * | ||
| 4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
| 5 | * | ||
| 6 | * this code detects hard lockups: incidents in where on a CPU | ||
| 7 | * the kernel does not respond to anything except NMI. | ||
| 8 | * | ||
| 9 | * Note: Most of this code is borrowed heavily from softlockup.c, | ||
| 10 | * so thanks to Ingo for the initial implementation. | ||
| 11 | * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks | ||
| 12 | * to those contributors as well. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #include <linux/mm.h> | ||
| 16 | #include <linux/cpu.h> | ||
| 17 | #include <linux/nmi.h> | ||
| 18 | #include <linux/init.h> | ||
| 19 | #include <linux/delay.h> | ||
| 20 | #include <linux/freezer.h> | ||
| 21 | #include <linux/kthread.h> | ||
| 22 | #include <linux/lockdep.h> | ||
| 23 | #include <linux/notifier.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/sysctl.h> | ||
| 26 | |||
| 27 | #include <asm/irq_regs.h> | ||
| 28 | #include <linux/perf_event.h> | ||
| 29 | |||
| 30 | int watchdog_enabled; | ||
| 31 | int __read_mostly softlockup_thresh = 60; | ||
| 32 | |||
| 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | ||
| 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | ||
| 35 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); | ||
| 36 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); | ||
| 37 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | ||
| 38 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 39 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
| 40 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
| 41 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
| 42 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | ||
| 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
| 44 | #endif | ||
| 45 | |||
| 46 | static int __read_mostly did_panic; | ||
| 47 | static int __initdata no_watchdog; | ||
| 48 | |||
| 49 | |||
| 50 | /* boot commands */ | ||
| 51 | /* | ||
| 52 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 53 | */ | ||
| 54 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 55 | static int hardlockup_panic; | ||
| 56 | |||
| 57 | static int __init hardlockup_panic_setup(char *str) | ||
| 58 | { | ||
| 59 | if (!strncmp(str, "panic", 5)) | ||
| 60 | hardlockup_panic = 1; | ||
| 61 | return 1; | ||
| 62 | } | ||
| 63 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 64 | #endif | ||
| 65 | |||
| 66 | unsigned int __read_mostly softlockup_panic = | ||
| 67 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | ||
| 68 | |||
| 69 | static int __init softlockup_panic_setup(char *str) | ||
| 70 | { | ||
| 71 | softlockup_panic = simple_strtoul(str, NULL, 0); | ||
| 72 | |||
| 73 | return 1; | ||
| 74 | } | ||
| 75 | __setup("softlockup_panic=", softlockup_panic_setup); | ||
| 76 | |||
| 77 | static int __init nowatchdog_setup(char *str) | ||
| 78 | { | ||
| 79 | no_watchdog = 1; | ||
| 80 | return 1; | ||
| 81 | } | ||
| 82 | __setup("nowatchdog", nowatchdog_setup); | ||
| 83 | |||
| 84 | /* deprecated */ | ||
| 85 | static int __init nosoftlockup_setup(char *str) | ||
| 86 | { | ||
| 87 | no_watchdog = 1; | ||
| 88 | return 1; | ||
| 89 | } | ||
| 90 | __setup("nosoftlockup", nosoftlockup_setup); | ||
| 91 | /* */ | ||
| 92 | |||
| 93 | |||
| 94 | /* | ||
| 95 | * Returns seconds, approximately. We don't need nanosecond | ||
| 96 | * resolution, and we don't need to waste time with a big divide when | ||
| 97 | * 2^30ns == 1.074s. | ||
| 98 | */ | ||
| 99 | static unsigned long get_timestamp(int this_cpu) | ||
| 100 | { | ||
| 101 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ | ||
| 102 | } | ||
| 103 | |||
| 104 | static unsigned long get_sample_period(void) | ||
| 105 | { | ||
| 106 | /* | ||
| 107 | * convert softlockup_thresh from seconds to ns | ||
| 108 | * the divide by 5 is to give hrtimer 5 chances to | ||
| 109 | * increment before the hardlockup detector generates | ||
| 110 | * a warning | ||
| 111 | */ | ||
| 112 | return softlockup_thresh / 5 * NSEC_PER_SEC; | ||
| 113 | } | ||
| 114 | |||
| 115 | /* Commands for resetting the watchdog */ | ||
| 116 | static void __touch_watchdog(void) | ||
| 117 | { | ||
| 118 | int this_cpu = smp_processor_id(); | ||
| 119 | |||
| 120 | __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); | ||
| 121 | } | ||
| 122 | |||
| 123 | void touch_softlockup_watchdog(void) | ||
| 124 | { | ||
| 125 | __get_cpu_var(watchdog_touch_ts) = 0; | ||
| 126 | } | ||
| 127 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
| 128 | |||
| 129 | void touch_all_softlockup_watchdogs(void) | ||
| 130 | { | ||
| 131 | int cpu; | ||
| 132 | |||
| 133 | /* | ||
| 134 | * this is done lockless | ||
| 135 | * do we care if a 0 races with a timestamp? | ||
| 136 | * all it means is the softlock check starts one cycle later | ||
| 137 | */ | ||
| 138 | for_each_online_cpu(cpu) | ||
| 139 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
| 140 | } | ||
| 141 | |||
| 142 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 143 | void touch_nmi_watchdog(void) | ||
| 144 | { | ||
| 145 | __get_cpu_var(watchdog_nmi_touch) = true; | ||
| 146 | touch_softlockup_watchdog(); | ||
| 147 | } | ||
| 148 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
| 149 | |||
| 150 | #endif | ||
| 151 | |||
| 152 | void touch_softlockup_watchdog_sync(void) | ||
| 153 | { | ||
| 154 | __raw_get_cpu_var(softlockup_touch_sync) = true; | ||
| 155 | __raw_get_cpu_var(watchdog_touch_ts) = 0; | ||
| 156 | } | ||
| 157 | |||
| 158 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 159 | /* watchdog detector functions */ | ||
| 160 | static int is_hardlockup(void) | ||
| 161 | { | ||
| 162 | unsigned long hrint = __get_cpu_var(hrtimer_interrupts); | ||
| 163 | |||
| 164 | if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) | ||
| 165 | return 1; | ||
| 166 | |||
| 167 | __get_cpu_var(hrtimer_interrupts_saved) = hrint; | ||
| 168 | return 0; | ||
| 169 | } | ||
| 170 | #endif | ||
| 171 | |||
| 172 | static int is_softlockup(unsigned long touch_ts) | ||
| 173 | { | ||
| 174 | unsigned long now = get_timestamp(smp_processor_id()); | ||
| 175 | |||
| 176 | /* Warn about unreasonable delays: */ | ||
| 177 | if (time_after(now, touch_ts + softlockup_thresh)) | ||
| 178 | return now - touch_ts; | ||
| 179 | |||
| 180 | return 0; | ||
| 181 | } | ||
| 182 | |||
| 183 | static int | ||
| 184 | watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) | ||
| 185 | { | ||
| 186 | did_panic = 1; | ||
| 187 | |||
| 188 | return NOTIFY_DONE; | ||
| 189 | } | ||
| 190 | |||
| 191 | static struct notifier_block panic_block = { | ||
| 192 | .notifier_call = watchdog_panic, | ||
| 193 | }; | ||
| 194 | |||
| 195 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 196 | static struct perf_event_attr wd_hw_attr = { | ||
| 197 | .type = PERF_TYPE_HARDWARE, | ||
| 198 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
| 199 | .size = sizeof(struct perf_event_attr), | ||
| 200 | .pinned = 1, | ||
| 201 | .disabled = 1, | ||
| 202 | }; | ||
| 203 | |||
| 204 | /* Callback function for perf event subsystem */ | ||
| 205 | void watchdog_overflow_callback(struct perf_event *event, int nmi, | ||
| 206 | struct perf_sample_data *data, | ||
| 207 | struct pt_regs *regs) | ||
| 208 | { | ||
| 209 | if (__get_cpu_var(watchdog_nmi_touch) == true) { | ||
| 210 | __get_cpu_var(watchdog_nmi_touch) = false; | ||
| 211 | return; | ||
| 212 | } | ||
| 213 | |||
| 214 | /* check for a hardlockup | ||
| 215 | * This is done by making sure our timer interrupt | ||
| 216 | * is incrementing. The timer interrupt should have | ||
| 217 | * fired multiple times before we overflow'd. If it hasn't | ||
| 218 | * then this is a good indication the cpu is stuck | ||
| 219 | */ | ||
| 220 | if (is_hardlockup()) { | ||
| 221 | int this_cpu = smp_processor_id(); | ||
| 222 | |||
| 223 | /* only print hardlockups once */ | ||
| 224 | if (__get_cpu_var(hard_watchdog_warn) == true) | ||
| 225 | return; | ||
| 226 | |||
| 227 | if (hardlockup_panic) | ||
| 228 | panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
| 229 | else | ||
| 230 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
| 231 | |||
| 232 | __get_cpu_var(hard_watchdog_warn) = true; | ||
| 233 | return; | ||
| 234 | } | ||
| 235 | |||
| 236 | __get_cpu_var(hard_watchdog_warn) = false; | ||
| 237 | return; | ||
| 238 | } | ||
| 239 | static void watchdog_interrupt_count(void) | ||
| 240 | { | ||
| 241 | __get_cpu_var(hrtimer_interrupts)++; | ||
| 242 | } | ||
| 243 | #else | ||
| 244 | static inline void watchdog_interrupt_count(void) { return; } | ||
| 245 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
| 246 | |||
| 247 | /* watchdog kicker functions */ | ||
| 248 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | ||
| 249 | { | ||
| 250 | unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); | ||
| 251 | struct pt_regs *regs = get_irq_regs(); | ||
| 252 | int duration; | ||
| 253 | |||
| 254 | /* kick the hardlockup detector */ | ||
| 255 | watchdog_interrupt_count(); | ||
| 256 | |||
| 257 | /* kick the softlockup detector */ | ||
| 258 | wake_up_process(__get_cpu_var(softlockup_watchdog)); | ||
| 259 | |||
| 260 | /* .. and repeat */ | ||
| 261 | hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); | ||
| 262 | |||
| 263 | if (touch_ts == 0) { | ||
| 264 | if (unlikely(__get_cpu_var(softlockup_touch_sync))) { | ||
| 265 | /* | ||
| 266 | * If the time stamp was touched atomically | ||
| 267 | * make sure the scheduler tick is up to date. | ||
| 268 | */ | ||
| 269 | __get_cpu_var(softlockup_touch_sync) = false; | ||
| 270 | sched_clock_tick(); | ||
| 271 | } | ||
| 272 | __touch_watchdog(); | ||
| 273 | return HRTIMER_RESTART; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* check for a softlockup | ||
| 277 | * This is done by making sure a high priority task is | ||
| 278 | * being scheduled. The task touches the watchdog to | ||
| 279 | * indicate it is getting cpu time. If it hasn't then | ||
| 280 | * this is a good indication some task is hogging the cpu | ||
| 281 | */ | ||
| 282 | duration = is_softlockup(touch_ts); | ||
| 283 | if (unlikely(duration)) { | ||
| 284 | /* only warn once */ | ||
| 285 | if (__get_cpu_var(soft_watchdog_warn) == true) | ||
| 286 | return HRTIMER_RESTART; | ||
| 287 | |||
| 288 | printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | ||
| 289 | smp_processor_id(), duration, | ||
| 290 | current->comm, task_pid_nr(current)); | ||
| 291 | print_modules(); | ||
| 292 | print_irqtrace_events(current); | ||
| 293 | if (regs) | ||
| 294 | show_regs(regs); | ||
| 295 | else | ||
| 296 | dump_stack(); | ||
| 297 | |||
| 298 | if (softlockup_panic) | ||
| 299 | panic("softlockup: hung tasks"); | ||
| 300 | __get_cpu_var(soft_watchdog_warn) = true; | ||
| 301 | } else | ||
| 302 | __get_cpu_var(soft_watchdog_warn) = false; | ||
| 303 | |||
| 304 | return HRTIMER_RESTART; | ||
| 305 | } | ||
| 306 | |||
| 307 | |||
| 308 | /* | ||
| 309 | * The watchdog thread - touches the timestamp. | ||
| 310 | */ | ||
| 311 | static int watchdog(void *unused) | ||
| 312 | { | ||
| 313 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
| 314 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
| 315 | |||
| 316 | sched_setscheduler(current, SCHED_FIFO, &param); | ||
| 317 | |||
| 318 | /* initialize timestamp */ | ||
| 319 | __touch_watchdog(); | ||
| 320 | |||
| 321 | /* kick off the timer for the hardlockup detector */ | ||
| 322 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | ||
| 323 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | ||
| 324 | HRTIMER_MODE_REL_PINNED); | ||
| 325 | |||
| 326 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 327 | /* | ||
| 328 | * Run briefly once per second to reset the softlockup timestamp. | ||
| 329 | * If this gets delayed for more than 60 seconds then the | ||
| 330 | * debug-printout triggers in watchdog_timer_fn(). | ||
| 331 | */ | ||
| 332 | while (!kthread_should_stop()) { | ||
| 333 | __touch_watchdog(); | ||
| 334 | schedule(); | ||
| 335 | |||
| 336 | if (kthread_should_stop()) | ||
| 337 | break; | ||
| 338 | |||
| 339 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 340 | } | ||
| 341 | __set_current_state(TASK_RUNNING); | ||
| 342 | |||
| 343 | return 0; | ||
| 344 | } | ||
| 345 | |||
| 346 | |||
| 347 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 348 | static int watchdog_nmi_enable(int cpu) | ||
| 349 | { | ||
| 350 | struct perf_event_attr *wd_attr; | ||
| 351 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
| 352 | |||
| 353 | /* is it already setup and enabled? */ | ||
| 354 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
| 355 | goto out; | ||
| 356 | |||
| 357 | /* it is setup but not enabled */ | ||
| 358 | if (event != NULL) | ||
| 359 | goto out_enable; | ||
| 360 | |||
| 361 | /* Try to register using hardware perf events */ | ||
| 362 | wd_attr = &wd_hw_attr; | ||
| 363 | wd_attr->sample_period = hw_nmi_get_sample_period(); | ||
| 364 | event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); | ||
| 365 | if (!IS_ERR(event)) { | ||
| 366 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | ||
| 367 | goto out_save; | ||
| 368 | } | ||
| 369 | |||
| 370 | printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); | ||
| 371 | return -1; | ||
| 372 | |||
| 373 | /* success path */ | ||
| 374 | out_save: | ||
| 375 | per_cpu(watchdog_ev, cpu) = event; | ||
| 376 | out_enable: | ||
| 377 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
| 378 | out: | ||
| 379 | return 0; | ||
| 380 | } | ||
| 381 | |||
| 382 | static void watchdog_nmi_disable(int cpu) | ||
| 383 | { | ||
| 384 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
| 385 | |||
| 386 | if (event) { | ||
| 387 | perf_event_disable(event); | ||
| 388 | per_cpu(watchdog_ev, cpu) = NULL; | ||
| 389 | |||
| 390 | /* should be in cleanup, but blocks oprofile */ | ||
| 391 | perf_event_release_kernel(event); | ||
| 392 | } | ||
| 393 | return; | ||
| 394 | } | ||
| 395 | #else | ||
| 396 | static int watchdog_nmi_enable(int cpu) { return 0; } | ||
| 397 | static void watchdog_nmi_disable(int cpu) { return; } | ||
| 398 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
| 399 | |||
| 400 | /* prepare/enable/disable routines */ | ||
| 401 | static int watchdog_prepare_cpu(int cpu) | ||
| 402 | { | ||
| 403 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
| 404 | |||
| 405 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | ||
| 406 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 407 | hrtimer->function = watchdog_timer_fn; | ||
| 408 | |||
| 409 | return 0; | ||
| 410 | } | ||
| 411 | |||
| 412 | static int watchdog_enable(int cpu) | ||
| 413 | { | ||
| 414 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
| 415 | |||
| 416 | /* enable the perf event */ | ||
| 417 | if (watchdog_nmi_enable(cpu) != 0) | ||
| 418 | return -1; | ||
| 419 | |||
| 420 | /* create the watchdog thread */ | ||
| 421 | if (!p) { | ||
| 422 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | ||
| 423 | if (IS_ERR(p)) { | ||
| 424 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | ||
| 425 | return -1; | ||
| 426 | } | ||
| 427 | kthread_bind(p, cpu); | ||
| 428 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
| 429 | per_cpu(softlockup_watchdog, cpu) = p; | ||
| 430 | wake_up_process(p); | ||
| 431 | } | ||
| 432 | |||
| 433 | return 0; | ||
| 434 | } | ||
| 435 | |||
| 436 | static void watchdog_disable(int cpu) | ||
| 437 | { | ||
| 438 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
| 439 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
| 440 | |||
| 441 | /* | ||
| 442 | * cancel the timer first to stop incrementing the stats | ||
| 443 | * and waking up the kthread | ||
| 444 | */ | ||
| 445 | hrtimer_cancel(hrtimer); | ||
| 446 | |||
| 447 | /* disable the perf event */ | ||
| 448 | watchdog_nmi_disable(cpu); | ||
| 449 | |||
| 450 | /* stop the watchdog thread */ | ||
| 451 | if (p) { | ||
| 452 | per_cpu(softlockup_watchdog, cpu) = NULL; | ||
| 453 | kthread_stop(p); | ||
| 454 | } | ||
| 455 | |||
| 456 | /* if any cpu succeeds, watchdog is considered enabled for the system */ | ||
| 457 | watchdog_enabled = 1; | ||
| 458 | } | ||
| 459 | |||
| 460 | static void watchdog_enable_all_cpus(void) | ||
| 461 | { | ||
| 462 | int cpu; | ||
| 463 | int result = 0; | ||
| 464 | |||
| 465 | for_each_online_cpu(cpu) | ||
| 466 | result += watchdog_enable(cpu); | ||
| 467 | |||
| 468 | if (result) | ||
| 469 | printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); | ||
| 470 | |||
| 471 | } | ||
| 472 | |||
| 473 | static void watchdog_disable_all_cpus(void) | ||
| 474 | { | ||
| 475 | int cpu; | ||
| 476 | |||
| 477 | for_each_online_cpu(cpu) | ||
| 478 | watchdog_disable(cpu); | ||
| 479 | |||
| 480 | /* if all watchdogs are disabled, then they are disabled for the system */ | ||
| 481 | watchdog_enabled = 0; | ||
| 482 | } | ||
| 483 | |||
| 484 | |||
| 485 | /* sysctl functions */ | ||
| 486 | #ifdef CONFIG_SYSCTL | ||
| 487 | /* | ||
| 488 | * proc handler for /proc/sys/kernel/nmi_watchdog | ||
| 489 | */ | ||
| 490 | |||
| 491 | int proc_dowatchdog_enabled(struct ctl_table *table, int write, | ||
| 492 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 493 | { | ||
| 494 | proc_dointvec(table, write, buffer, length, ppos); | ||
| 495 | |||
| 496 | if (watchdog_enabled) | ||
| 497 | watchdog_enable_all_cpus(); | ||
| 498 | else | ||
| 499 | watchdog_disable_all_cpus(); | ||
| 500 | return 0; | ||
| 501 | } | ||
| 502 | |||
| 503 | int proc_dowatchdog_thresh(struct ctl_table *table, int write, | ||
| 504 | void __user *buffer, | ||
| 505 | size_t *lenp, loff_t *ppos) | ||
| 506 | { | ||
| 507 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 508 | } | ||
| 509 | #endif /* CONFIG_SYSCTL */ | ||
| 510 | |||
| 511 | |||
| 512 | /* | ||
| 513 | * Create/destroy watchdog threads as CPUs come and go: | ||
| 514 | */ | ||
| 515 | static int __cpuinit | ||
| 516 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
| 517 | { | ||
| 518 | int hotcpu = (unsigned long)hcpu; | ||
| 519 | |||
| 520 | switch (action) { | ||
| 521 | case CPU_UP_PREPARE: | ||
| 522 | case CPU_UP_PREPARE_FROZEN: | ||
| 523 | if (watchdog_prepare_cpu(hotcpu)) | ||
| 524 | return NOTIFY_BAD; | ||
| 525 | break; | ||
| 526 | case CPU_ONLINE: | ||
| 527 | case CPU_ONLINE_FROZEN: | ||
| 528 | if (watchdog_enable(hotcpu)) | ||
| 529 | return NOTIFY_BAD; | ||
| 530 | break; | ||
| 531 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 532 | case CPU_UP_CANCELED: | ||
| 533 | case CPU_UP_CANCELED_FROZEN: | ||
| 534 | watchdog_disable(hotcpu); | ||
| 535 | break; | ||
| 536 | case CPU_DEAD: | ||
| 537 | case CPU_DEAD_FROZEN: | ||
| 538 | watchdog_disable(hotcpu); | ||
| 539 | break; | ||
| 540 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 541 | } | ||
| 542 | return NOTIFY_OK; | ||
| 543 | } | ||
| 544 | |||
| 545 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
| 546 | .notifier_call = cpu_callback | ||
| 547 | }; | ||
| 548 | |||
| 549 | static int __init spawn_watchdog_task(void) | ||
| 550 | { | ||
| 551 | void *cpu = (void *)(long)smp_processor_id(); | ||
| 552 | int err; | ||
| 553 | |||
| 554 | if (no_watchdog) | ||
| 555 | return 0; | ||
| 556 | |||
| 557 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
| 558 | WARN_ON(err == NOTIFY_BAD); | ||
| 559 | |||
| 560 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
| 561 | register_cpu_notifier(&cpu_nfb); | ||
| 562 | |||
| 563 | atomic_notifier_chain_register(&panic_notifier_list, &panic_block); | ||
| 564 | |||
| 565 | return 0; | ||
| 566 | } | ||
| 567 | early_initcall(spawn_watchdog_task); | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb4451..9ca34cddaf6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -33,41 +33,287 @@ | |||
| 33 | #include <linux/kallsyms.h> | 33 | #include <linux/kallsyms.h> |
| 34 | #include <linux/debug_locks.h> | 34 | #include <linux/debug_locks.h> |
| 35 | #include <linux/lockdep.h> | 35 | #include <linux/lockdep.h> |
| 36 | #define CREATE_TRACE_POINTS | 36 | #include <linux/idr.h> |
| 37 | #include <trace/events/workqueue.h> | 37 | |
| 38 | #include "workqueue_sched.h" | ||
| 39 | |||
| 40 | enum { | ||
| 41 | /* global_cwq flags */ | ||
| 42 | GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
| 43 | GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ | ||
| 44 | GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | ||
| 45 | GCWQ_FREEZING = 1 << 3, /* freeze in progress */ | ||
| 46 | GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ | ||
| 47 | |||
| 48 | /* worker flags */ | ||
| 49 | WORKER_STARTED = 1 << 0, /* started */ | ||
| 50 | WORKER_DIE = 1 << 1, /* die die die */ | ||
| 51 | WORKER_IDLE = 1 << 2, /* is idle */ | ||
| 52 | WORKER_PREP = 1 << 3, /* preparing to run works */ | ||
| 53 | WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ | ||
| 54 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | ||
| 55 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | ||
| 56 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | ||
| 57 | |||
| 58 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | | ||
| 59 | WORKER_CPU_INTENSIVE | WORKER_UNBOUND, | ||
| 60 | |||
| 61 | /* gcwq->trustee_state */ | ||
| 62 | TRUSTEE_START = 0, /* start */ | ||
| 63 | TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ | ||
| 64 | TRUSTEE_BUTCHER = 2, /* butcher workers */ | ||
| 65 | TRUSTEE_RELEASE = 3, /* release workers */ | ||
| 66 | TRUSTEE_DONE = 4, /* trustee is done */ | ||
| 67 | |||
| 68 | BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ | ||
| 69 | BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, | ||
| 70 | BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, | ||
| 71 | |||
| 72 | MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ | ||
| 73 | IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ | ||
| 74 | |||
| 75 | MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ | ||
| 76 | MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ | ||
| 77 | CREATE_COOLDOWN = HZ, /* time to breath after fail */ | ||
| 78 | TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Rescue workers are used only on emergencies and shared by | ||
| 82 | * all cpus. Give -20. | ||
| 83 | */ | ||
| 84 | RESCUER_NICE_LEVEL = -20, | ||
| 85 | }; | ||
| 38 | 86 | ||
| 39 | /* | 87 | /* |
| 40 | * The per-CPU workqueue (if single thread, we always use the first | 88 | * Structure fields follow one of the following exclusion rules. |
| 41 | * possible cpu). | 89 | * |
| 90 | * I: Set during initialization and read-only afterwards. | ||
| 91 | * | ||
| 92 | * P: Preemption protected. Disabling preemption is enough and should | ||
| 93 | * only be modified and accessed from the local cpu. | ||
| 94 | * | ||
| 95 | * L: gcwq->lock protected. Access with gcwq->lock held. | ||
| 96 | * | ||
| 97 | * X: During normal operation, modification requires gcwq->lock and | ||
| 98 | * should be done only from local cpu. Either disabling preemption | ||
| 99 | * on local cpu or grabbing gcwq->lock is enough for read access. | ||
| 100 | * If GCWQ_DISASSOCIATED is set, it's identical to L. | ||
| 101 | * | ||
| 102 | * F: wq->flush_mutex protected. | ||
| 103 | * | ||
| 104 | * W: workqueue_lock protected. | ||
| 42 | */ | 105 | */ |
| 43 | struct cpu_workqueue_struct { | ||
| 44 | 106 | ||
| 45 | spinlock_t lock; | 107 | struct global_cwq; |
| 46 | 108 | ||
| 47 | struct list_head worklist; | 109 | /* |
| 48 | wait_queue_head_t more_work; | 110 | * The poor guys doing the actual heavy lifting. All on-duty workers |
| 49 | struct work_struct *current_work; | 111 | * are either serving the manager role, on idle list or on busy hash. |
| 112 | */ | ||
| 113 | struct worker { | ||
| 114 | /* on idle list while idle, on busy hash table while busy */ | ||
| 115 | union { | ||
| 116 | struct list_head entry; /* L: while idle */ | ||
| 117 | struct hlist_node hentry; /* L: while busy */ | ||
| 118 | }; | ||
| 50 | 119 | ||
| 51 | struct workqueue_struct *wq; | 120 | struct work_struct *current_work; /* L: work being processed */ |
| 52 | struct task_struct *thread; | 121 | struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ |
| 53 | } ____cacheline_aligned; | 122 | struct list_head scheduled; /* L: scheduled works */ |
| 123 | struct task_struct *task; /* I: worker task */ | ||
| 124 | struct global_cwq *gcwq; /* I: the associated gcwq */ | ||
| 125 | /* 64 bytes boundary on 64bit, 32 on 32bit */ | ||
| 126 | unsigned long last_active; /* L: last active timestamp */ | ||
| 127 | unsigned int flags; /* X: flags */ | ||
| 128 | int id; /* I: worker id */ | ||
| 129 | struct work_struct rebind_work; /* L: rebind worker to cpu */ | ||
| 130 | }; | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Global per-cpu workqueue. There's one and only one for each cpu | ||
| 134 | * and all works are queued and processed here regardless of their | ||
| 135 | * target workqueues. | ||
| 136 | */ | ||
| 137 | struct global_cwq { | ||
| 138 | spinlock_t lock; /* the gcwq lock */ | ||
| 139 | struct list_head worklist; /* L: list of pending works */ | ||
| 140 | unsigned int cpu; /* I: the associated cpu */ | ||
| 141 | unsigned int flags; /* L: GCWQ_* flags */ | ||
| 142 | |||
| 143 | int nr_workers; /* L: total number of workers */ | ||
| 144 | int nr_idle; /* L: currently idle ones */ | ||
| 145 | |||
| 146 | /* workers are chained either in the idle_list or busy_hash */ | ||
| 147 | struct list_head idle_list; /* X: list of idle workers */ | ||
| 148 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | ||
| 149 | /* L: hash of busy workers */ | ||
| 150 | |||
| 151 | struct timer_list idle_timer; /* L: worker idle timeout */ | ||
| 152 | struct timer_list mayday_timer; /* L: SOS timer for dworkers */ | ||
| 153 | |||
| 154 | struct ida worker_ida; /* L: for worker IDs */ | ||
| 155 | |||
| 156 | struct task_struct *trustee; /* L: for gcwq shutdown */ | ||
| 157 | unsigned int trustee_state; /* L: trustee state */ | ||
| 158 | wait_queue_head_t trustee_wait; /* trustee wait */ | ||
| 159 | struct worker *first_idle; /* L: first idle worker */ | ||
| 160 | } ____cacheline_aligned_in_smp; | ||
| 161 | |||
| 162 | /* | ||
| 163 | * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of | ||
| 164 | * work_struct->data are used for flags and thus cwqs need to be | ||
| 165 | * aligned at two's power of the number of flag bits. | ||
| 166 | */ | ||
| 167 | struct cpu_workqueue_struct { | ||
| 168 | struct global_cwq *gcwq; /* I: the associated gcwq */ | ||
| 169 | struct workqueue_struct *wq; /* I: the owning workqueue */ | ||
| 170 | int work_color; /* L: current color */ | ||
| 171 | int flush_color; /* L: flushing color */ | ||
| 172 | int nr_in_flight[WORK_NR_COLORS]; | ||
| 173 | /* L: nr of in_flight works */ | ||
| 174 | int nr_active; /* L: nr of active works */ | ||
| 175 | int max_active; /* L: max active works */ | ||
| 176 | struct list_head delayed_works; /* L: delayed works */ | ||
| 177 | }; | ||
| 178 | |||
| 179 | /* | ||
| 180 | * Structure used to wait for workqueue flush. | ||
| 181 | */ | ||
| 182 | struct wq_flusher { | ||
| 183 | struct list_head list; /* F: list of flushers */ | ||
| 184 | int flush_color; /* F: flush color waiting for */ | ||
| 185 | struct completion done; /* flush completion */ | ||
| 186 | }; | ||
| 187 | |||
| 188 | /* | ||
| 189 | * All cpumasks are assumed to be always set on UP and thus can't be | ||
| 190 | * used to determine whether there's something to be done. | ||
| 191 | */ | ||
| 192 | #ifdef CONFIG_SMP | ||
| 193 | typedef cpumask_var_t mayday_mask_t; | ||
| 194 | #define mayday_test_and_set_cpu(cpu, mask) \ | ||
| 195 | cpumask_test_and_set_cpu((cpu), (mask)) | ||
| 196 | #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) | ||
| 197 | #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) | ||
| 198 | #define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) | ||
| 199 | #define free_mayday_mask(mask) free_cpumask_var((mask)) | ||
| 200 | #else | ||
| 201 | typedef unsigned long mayday_mask_t; | ||
| 202 | #define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) | ||
| 203 | #define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) | ||
| 204 | #define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) | ||
| 205 | #define alloc_mayday_mask(maskp, gfp) true | ||
| 206 | #define free_mayday_mask(mask) do { } while (0) | ||
| 207 | #endif | ||
| 54 | 208 | ||
| 55 | /* | 209 | /* |
| 56 | * The externally visible workqueue abstraction is an array of | 210 | * The externally visible workqueue abstraction is an array of |
| 57 | * per-CPU workqueues: | 211 | * per-CPU workqueues: |
| 58 | */ | 212 | */ |
| 59 | struct workqueue_struct { | 213 | struct workqueue_struct { |
| 60 | struct cpu_workqueue_struct *cpu_wq; | 214 | unsigned int flags; /* I: WQ_* flags */ |
| 61 | struct list_head list; | 215 | union { |
| 62 | const char *name; | 216 | struct cpu_workqueue_struct __percpu *pcpu; |
| 63 | int singlethread; | 217 | struct cpu_workqueue_struct *single; |
| 64 | int freezeable; /* Freeze threads during suspend */ | 218 | unsigned long v; |
| 65 | int rt; | 219 | } cpu_wq; /* I: cwq's */ |
| 220 | struct list_head list; /* W: list of all workqueues */ | ||
| 221 | |||
| 222 | struct mutex flush_mutex; /* protects wq flushing */ | ||
| 223 | int work_color; /* F: current work color */ | ||
| 224 | int flush_color; /* F: current flush color */ | ||
| 225 | atomic_t nr_cwqs_to_flush; /* flush in progress */ | ||
| 226 | struct wq_flusher *first_flusher; /* F: first flusher */ | ||
| 227 | struct list_head flusher_queue; /* F: flush waiters */ | ||
| 228 | struct list_head flusher_overflow; /* F: flush overflow list */ | ||
| 229 | |||
| 230 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ | ||
| 231 | struct worker *rescuer; /* I: rescue worker */ | ||
| 232 | |||
| 233 | int saved_max_active; /* W: saved cwq max_active */ | ||
| 234 | const char *name; /* I: workqueue name */ | ||
| 66 | #ifdef CONFIG_LOCKDEP | 235 | #ifdef CONFIG_LOCKDEP |
| 67 | struct lockdep_map lockdep_map; | 236 | struct lockdep_map lockdep_map; |
| 68 | #endif | 237 | #endif |
| 69 | }; | 238 | }; |
| 70 | 239 | ||
| 240 | struct workqueue_struct *system_wq __read_mostly; | ||
| 241 | struct workqueue_struct *system_long_wq __read_mostly; | ||
| 242 | struct workqueue_struct *system_nrt_wq __read_mostly; | ||
| 243 | struct workqueue_struct *system_unbound_wq __read_mostly; | ||
| 244 | EXPORT_SYMBOL_GPL(system_wq); | ||
| 245 | EXPORT_SYMBOL_GPL(system_long_wq); | ||
| 246 | EXPORT_SYMBOL_GPL(system_nrt_wq); | ||
| 247 | EXPORT_SYMBOL_GPL(system_unbound_wq); | ||
| 248 | |||
| 249 | #define for_each_busy_worker(worker, i, pos, gcwq) \ | ||
| 250 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ | ||
| 251 | hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) | ||
| 252 | |||
| 253 | static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, | ||
| 254 | unsigned int sw) | ||
| 255 | { | ||
| 256 | if (cpu < nr_cpu_ids) { | ||
| 257 | if (sw & 1) { | ||
| 258 | cpu = cpumask_next(cpu, mask); | ||
| 259 | if (cpu < nr_cpu_ids) | ||
| 260 | return cpu; | ||
| 261 | } | ||
| 262 | if (sw & 2) | ||
| 263 | return WORK_CPU_UNBOUND; | ||
| 264 | } | ||
| 265 | return WORK_CPU_NONE; | ||
| 266 | } | ||
| 267 | |||
| 268 | static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, | ||
| 269 | struct workqueue_struct *wq) | ||
| 270 | { | ||
| 271 | return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); | ||
| 272 | } | ||
| 273 | |||
| 274 | /* | ||
| 275 | * CPU iterators | ||
| 276 | * | ||
| 277 | * An extra gcwq is defined for an invalid cpu number | ||
| 278 | * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any | ||
| 279 | * specific CPU. The following iterators are similar to | ||
| 280 | * for_each_*_cpu() iterators but also considers the unbound gcwq. | ||
| 281 | * | ||
| 282 | * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND | ||
| 283 | * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND | ||
| 284 | * for_each_cwq_cpu() : possible CPUs for bound workqueues, | ||
| 285 | * WORK_CPU_UNBOUND for unbound workqueues | ||
| 286 | */ | ||
| 287 | #define for_each_gcwq_cpu(cpu) \ | ||
| 288 | for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ | ||
| 289 | (cpu) < WORK_CPU_NONE; \ | ||
| 290 | (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) | ||
| 291 | |||
| 292 | #define for_each_online_gcwq_cpu(cpu) \ | ||
| 293 | for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ | ||
| 294 | (cpu) < WORK_CPU_NONE; \ | ||
| 295 | (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) | ||
| 296 | |||
| 297 | #define for_each_cwq_cpu(cpu, wq) \ | ||
| 298 | for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ | ||
| 299 | (cpu) < WORK_CPU_NONE; \ | ||
| 300 | (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) | ||
| 301 | |||
| 302 | #ifdef CONFIG_LOCKDEP | ||
| 303 | /** | ||
| 304 | * in_workqueue_context() - in context of specified workqueue? | ||
| 305 | * @wq: the workqueue of interest | ||
| 306 | * | ||
| 307 | * Checks lockdep state to see if the current task is executing from | ||
| 308 | * within a workqueue item. This function exists only if lockdep is | ||
| 309 | * enabled. | ||
| 310 | */ | ||
| 311 | int in_workqueue_context(struct workqueue_struct *wq) | ||
| 312 | { | ||
| 313 | return lock_is_held(&wq->lockdep_map); | ||
| 314 | } | ||
| 315 | #endif | ||
| 316 | |||
| 71 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | 317 | #ifdef CONFIG_DEBUG_OBJECTS_WORK |
| 72 | 318 | ||
| 73 | static struct debug_obj_descr work_debug_descr; | 319 | static struct debug_obj_descr work_debug_descr; |
| @@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) | |||
| 107 | * statically initialized. We just make sure that it | 353 | * statically initialized. We just make sure that it |
| 108 | * is tracked in the object tracker. | 354 | * is tracked in the object tracker. |
| 109 | */ | 355 | */ |
| 110 | if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { | 356 | if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { |
| 111 | debug_object_init(work, &work_debug_descr); | 357 | debug_object_init(work, &work_debug_descr); |
| 112 | debug_object_activate(work, &work_debug_descr); | 358 | debug_object_activate(work, &work_debug_descr); |
| 113 | return 0; | 359 | return 0; |
| @@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { } | |||
| 181 | /* Serializes the accesses to the list of workqueues. */ | 427 | /* Serializes the accesses to the list of workqueues. */ |
| 182 | static DEFINE_SPINLOCK(workqueue_lock); | 428 | static DEFINE_SPINLOCK(workqueue_lock); |
| 183 | static LIST_HEAD(workqueues); | 429 | static LIST_HEAD(workqueues); |
| 430 | static bool workqueue_freezing; /* W: have wqs started freezing? */ | ||
| 431 | |||
| 432 | /* | ||
| 433 | * The almighty global cpu workqueues. nr_running is the only field | ||
| 434 | * which is expected to be used frequently by other cpus via | ||
| 435 | * try_to_wake_up(). Put it in a separate cacheline. | ||
| 436 | */ | ||
| 437 | static DEFINE_PER_CPU(struct global_cwq, global_cwq); | ||
| 438 | static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); | ||
| 184 | 439 | ||
| 185 | static int singlethread_cpu __read_mostly; | ||
| 186 | static const struct cpumask *cpu_singlethread_map __read_mostly; | ||
| 187 | /* | 440 | /* |
| 188 | * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD | 441 | * Global cpu workqueue and nr_running counter for unbound gcwq. The |
| 189 | * flushes cwq->worklist. This means that flush_workqueue/wait_on_work | 442 | * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its |
| 190 | * which comes in between can't use for_each_online_cpu(). We could | 443 | * workers have WORKER_UNBOUND set. |
| 191 | * use cpu_possible_map, the cpumask below is more a documentation | ||
| 192 | * than optimization. | ||
| 193 | */ | 444 | */ |
| 194 | static cpumask_var_t cpu_populated_map __read_mostly; | 445 | static struct global_cwq unbound_global_cwq; |
| 446 | static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ | ||
| 195 | 447 | ||
| 196 | /* If it's single threaded, it isn't in the list of workqueues. */ | 448 | static int worker_thread(void *__worker); |
| 197 | static inline int is_wq_single_threaded(struct workqueue_struct *wq) | 449 | |
| 450 | static struct global_cwq *get_gcwq(unsigned int cpu) | ||
| 198 | { | 451 | { |
| 199 | return wq->singlethread; | 452 | if (cpu != WORK_CPU_UNBOUND) |
| 453 | return &per_cpu(global_cwq, cpu); | ||
| 454 | else | ||
| 455 | return &unbound_global_cwq; | ||
| 200 | } | 456 | } |
| 201 | 457 | ||
| 202 | static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) | 458 | static atomic_t *get_gcwq_nr_running(unsigned int cpu) |
| 203 | { | 459 | { |
| 204 | return is_wq_single_threaded(wq) | 460 | if (cpu != WORK_CPU_UNBOUND) |
| 205 | ? cpu_singlethread_map : cpu_populated_map; | 461 | return &per_cpu(gcwq_nr_running, cpu); |
| 462 | else | ||
| 463 | return &unbound_gcwq_nr_running; | ||
| 206 | } | 464 | } |
| 207 | 465 | ||
| 208 | static | 466 | static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, |
| 209 | struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) | 467 | struct workqueue_struct *wq) |
| 210 | { | 468 | { |
| 211 | if (unlikely(is_wq_single_threaded(wq))) | 469 | if (!(wq->flags & WQ_UNBOUND)) { |
| 212 | cpu = singlethread_cpu; | 470 | if (likely(cpu < nr_cpu_ids)) { |
| 213 | return per_cpu_ptr(wq->cpu_wq, cpu); | 471 | #ifdef CONFIG_SMP |
| 472 | return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); | ||
| 473 | #else | ||
| 474 | return wq->cpu_wq.single; | ||
| 475 | #endif | ||
| 476 | } | ||
| 477 | } else if (likely(cpu == WORK_CPU_UNBOUND)) | ||
| 478 | return wq->cpu_wq.single; | ||
| 479 | return NULL; | ||
| 480 | } | ||
| 481 | |||
| 482 | static unsigned int work_color_to_flags(int color) | ||
| 483 | { | ||
| 484 | return color << WORK_STRUCT_COLOR_SHIFT; | ||
| 485 | } | ||
| 486 | |||
| 487 | static int get_work_color(struct work_struct *work) | ||
| 488 | { | ||
| 489 | return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & | ||
| 490 | ((1 << WORK_STRUCT_COLOR_BITS) - 1); | ||
| 491 | } | ||
| 492 | |||
| 493 | static int work_next_color(int color) | ||
| 494 | { | ||
| 495 | return (color + 1) % WORK_NR_COLORS; | ||
| 214 | } | 496 | } |
| 215 | 497 | ||
| 216 | /* | 498 | /* |
| 217 | * Set the workqueue on which a work item is to be run | 499 | * A work's data points to the cwq with WORK_STRUCT_CWQ set while the |
| 218 | * - Must *only* be called if the pending flag is set | 500 | * work is on queue. Once execution starts, WORK_STRUCT_CWQ is |
| 501 | * cleared and the work data contains the cpu number it was last on. | ||
| 502 | * | ||
| 503 | * set_work_{cwq|cpu}() and clear_work_data() can be used to set the | ||
| 504 | * cwq, cpu or clear work->data. These functions should only be | ||
| 505 | * called while the work is owned - ie. while the PENDING bit is set. | ||
| 506 | * | ||
| 507 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq | ||
| 508 | * corresponding to a work. gcwq is available once the work has been | ||
| 509 | * queued anywhere after initialization. cwq is available only from | ||
| 510 | * queueing until execution starts. | ||
| 219 | */ | 511 | */ |
| 220 | static inline void set_wq_data(struct work_struct *work, | 512 | static inline void set_work_data(struct work_struct *work, unsigned long data, |
| 221 | struct cpu_workqueue_struct *cwq) | 513 | unsigned long flags) |
| 222 | { | 514 | { |
| 223 | unsigned long new; | ||
| 224 | |||
| 225 | BUG_ON(!work_pending(work)); | 515 | BUG_ON(!work_pending(work)); |
| 516 | atomic_long_set(&work->data, data | flags | work_static(work)); | ||
| 517 | } | ||
| 518 | |||
| 519 | static void set_work_cwq(struct work_struct *work, | ||
| 520 | struct cpu_workqueue_struct *cwq, | ||
| 521 | unsigned long extra_flags) | ||
| 522 | { | ||
| 523 | set_work_data(work, (unsigned long)cwq, | ||
| 524 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); | ||
| 525 | } | ||
| 526 | |||
| 527 | static void set_work_cpu(struct work_struct *work, unsigned int cpu) | ||
| 528 | { | ||
| 529 | set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); | ||
| 530 | } | ||
| 531 | |||
| 532 | static void clear_work_data(struct work_struct *work) | ||
| 533 | { | ||
| 534 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); | ||
| 535 | } | ||
| 536 | |||
| 537 | static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) | ||
| 538 | { | ||
| 539 | unsigned long data = atomic_long_read(&work->data); | ||
| 540 | |||
| 541 | if (data & WORK_STRUCT_CWQ) | ||
| 542 | return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); | ||
| 543 | else | ||
| 544 | return NULL; | ||
| 545 | } | ||
| 546 | |||
| 547 | static struct global_cwq *get_work_gcwq(struct work_struct *work) | ||
| 548 | { | ||
| 549 | unsigned long data = atomic_long_read(&work->data); | ||
| 550 | unsigned int cpu; | ||
| 551 | |||
| 552 | if (data & WORK_STRUCT_CWQ) | ||
| 553 | return ((struct cpu_workqueue_struct *) | ||
| 554 | (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; | ||
| 555 | |||
| 556 | cpu = data >> WORK_STRUCT_FLAG_BITS; | ||
| 557 | if (cpu == WORK_CPU_NONE) | ||
| 558 | return NULL; | ||
| 559 | |||
| 560 | BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); | ||
| 561 | return get_gcwq(cpu); | ||
| 562 | } | ||
| 563 | |||
| 564 | /* | ||
| 565 | * Policy functions. These define the policies on how the global | ||
| 566 | * worker pool is managed. Unless noted otherwise, these functions | ||
| 567 | * assume that they're being called with gcwq->lock held. | ||
| 568 | */ | ||
| 569 | |||
| 570 | static bool __need_more_worker(struct global_cwq *gcwq) | ||
| 571 | { | ||
| 572 | return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || | ||
| 573 | gcwq->flags & GCWQ_HIGHPRI_PENDING; | ||
| 574 | } | ||
| 575 | |||
| 576 | /* | ||
| 577 | * Need to wake up a worker? Called from anything but currently | ||
| 578 | * running workers. | ||
| 579 | */ | ||
| 580 | static bool need_more_worker(struct global_cwq *gcwq) | ||
| 581 | { | ||
| 582 | return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); | ||
| 583 | } | ||
| 584 | |||
| 585 | /* Can I start working? Called from busy but !running workers. */ | ||
| 586 | static bool may_start_working(struct global_cwq *gcwq) | ||
| 587 | { | ||
| 588 | return gcwq->nr_idle; | ||
| 589 | } | ||
| 590 | |||
| 591 | /* Do I need to keep working? Called from currently running workers. */ | ||
| 592 | static bool keep_working(struct global_cwq *gcwq) | ||
| 593 | { | ||
| 594 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | ||
| 595 | |||
| 596 | return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; | ||
| 597 | } | ||
| 598 | |||
| 599 | /* Do we need a new worker? Called from manager. */ | ||
| 600 | static bool need_to_create_worker(struct global_cwq *gcwq) | ||
| 601 | { | ||
| 602 | return need_more_worker(gcwq) && !may_start_working(gcwq); | ||
| 603 | } | ||
| 604 | |||
| 605 | /* Do I need to be the manager? */ | ||
| 606 | static bool need_to_manage_workers(struct global_cwq *gcwq) | ||
| 607 | { | ||
| 608 | return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; | ||
| 609 | } | ||
| 610 | |||
| 611 | /* Do we have too many workers and should some go away? */ | ||
| 612 | static bool too_many_workers(struct global_cwq *gcwq) | ||
| 613 | { | ||
| 614 | bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; | ||
| 615 | int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ | ||
| 616 | int nr_busy = gcwq->nr_workers - nr_idle; | ||
| 226 | 617 | ||
| 227 | new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); | 618 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; |
| 228 | new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); | ||
| 229 | atomic_long_set(&work->data, new); | ||
| 230 | } | 619 | } |
| 231 | 620 | ||
| 232 | /* | 621 | /* |
| 233 | * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. | 622 | * Wake up functions. |
| 234 | */ | 623 | */ |
| 235 | static inline void clear_wq_data(struct work_struct *work) | 624 | |
| 625 | /* Return the first worker. Safe with preemption disabled */ | ||
| 626 | static struct worker *first_worker(struct global_cwq *gcwq) | ||
| 236 | { | 627 | { |
| 237 | unsigned long flags = *work_data_bits(work) & | 628 | if (unlikely(list_empty(&gcwq->idle_list))) |
| 238 | (1UL << WORK_STRUCT_STATIC); | 629 | return NULL; |
| 239 | atomic_long_set(&work->data, flags); | 630 | |
| 631 | return list_first_entry(&gcwq->idle_list, struct worker, entry); | ||
| 240 | } | 632 | } |
| 241 | 633 | ||
| 242 | static inline | 634 | /** |
| 243 | struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) | 635 | * wake_up_worker - wake up an idle worker |
| 636 | * @gcwq: gcwq to wake worker for | ||
| 637 | * | ||
| 638 | * Wake up the first idle worker of @gcwq. | ||
| 639 | * | ||
| 640 | * CONTEXT: | ||
| 641 | * spin_lock_irq(gcwq->lock). | ||
| 642 | */ | ||
| 643 | static void wake_up_worker(struct global_cwq *gcwq) | ||
| 244 | { | 644 | { |
| 245 | return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); | 645 | struct worker *worker = first_worker(gcwq); |
| 646 | |||
| 647 | if (likely(worker)) | ||
| 648 | wake_up_process(worker->task); | ||
| 649 | } | ||
| 650 | |||
| 651 | /** | ||
| 652 | * wq_worker_waking_up - a worker is waking up | ||
| 653 | * @task: task waking up | ||
| 654 | * @cpu: CPU @task is waking up to | ||
| 655 | * | ||
| 656 | * This function is called during try_to_wake_up() when a worker is | ||
| 657 | * being awoken. | ||
| 658 | * | ||
| 659 | * CONTEXT: | ||
| 660 | * spin_lock_irq(rq->lock) | ||
| 661 | */ | ||
| 662 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) | ||
| 663 | { | ||
| 664 | struct worker *worker = kthread_data(task); | ||
| 665 | |||
| 666 | if (likely(!(worker->flags & WORKER_NOT_RUNNING))) | ||
| 667 | atomic_inc(get_gcwq_nr_running(cpu)); | ||
| 668 | } | ||
| 669 | |||
| 670 | /** | ||
| 671 | * wq_worker_sleeping - a worker is going to sleep | ||
| 672 | * @task: task going to sleep | ||
| 673 | * @cpu: CPU in question, must be the current CPU number | ||
| 674 | * | ||
| 675 | * This function is called during schedule() when a busy worker is | ||
| 676 | * going to sleep. Worker on the same cpu can be woken up by | ||
| 677 | * returning pointer to its task. | ||
| 678 | * | ||
| 679 | * CONTEXT: | ||
| 680 | * spin_lock_irq(rq->lock) | ||
| 681 | * | ||
| 682 | * RETURNS: | ||
| 683 | * Worker task on @cpu to wake up, %NULL if none. | ||
| 684 | */ | ||
| 685 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
| 686 | unsigned int cpu) | ||
| 687 | { | ||
| 688 | struct worker *worker = kthread_data(task), *to_wakeup = NULL; | ||
| 689 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 690 | atomic_t *nr_running = get_gcwq_nr_running(cpu); | ||
| 691 | |||
| 692 | if (unlikely(worker->flags & WORKER_NOT_RUNNING)) | ||
| 693 | return NULL; | ||
| 694 | |||
| 695 | /* this can only happen on the local cpu */ | ||
| 696 | BUG_ON(cpu != raw_smp_processor_id()); | ||
| 697 | |||
| 698 | /* | ||
| 699 | * The counterpart of the following dec_and_test, implied mb, | ||
| 700 | * worklist not empty test sequence is in insert_work(). | ||
| 701 | * Please read comment there. | ||
| 702 | * | ||
| 703 | * NOT_RUNNING is clear. This means that trustee is not in | ||
| 704 | * charge and we're running on the local cpu w/ rq lock held | ||
| 705 | * and preemption disabled, which in turn means that none else | ||
| 706 | * could be manipulating idle_list, so dereferencing idle_list | ||
| 707 | * without gcwq lock is safe. | ||
| 708 | */ | ||
| 709 | if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) | ||
| 710 | to_wakeup = first_worker(gcwq); | ||
| 711 | return to_wakeup ? to_wakeup->task : NULL; | ||
| 712 | } | ||
| 713 | |||
| 714 | /** | ||
| 715 | * worker_set_flags - set worker flags and adjust nr_running accordingly | ||
| 716 | * @worker: self | ||
| 717 | * @flags: flags to set | ||
| 718 | * @wakeup: wakeup an idle worker if necessary | ||
| 719 | * | ||
| 720 | * Set @flags in @worker->flags and adjust nr_running accordingly. If | ||
| 721 | * nr_running becomes zero and @wakeup is %true, an idle worker is | ||
| 722 | * woken up. | ||
| 723 | * | ||
| 724 | * CONTEXT: | ||
| 725 | * spin_lock_irq(gcwq->lock) | ||
| 726 | */ | ||
| 727 | static inline void worker_set_flags(struct worker *worker, unsigned int flags, | ||
| 728 | bool wakeup) | ||
| 729 | { | ||
| 730 | struct global_cwq *gcwq = worker->gcwq; | ||
| 731 | |||
| 732 | WARN_ON_ONCE(worker->task != current); | ||
| 733 | |||
| 734 | /* | ||
| 735 | * If transitioning into NOT_RUNNING, adjust nr_running and | ||
| 736 | * wake up an idle worker as necessary if requested by | ||
| 737 | * @wakeup. | ||
| 738 | */ | ||
| 739 | if ((flags & WORKER_NOT_RUNNING) && | ||
| 740 | !(worker->flags & WORKER_NOT_RUNNING)) { | ||
| 741 | atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); | ||
| 742 | |||
| 743 | if (wakeup) { | ||
| 744 | if (atomic_dec_and_test(nr_running) && | ||
| 745 | !list_empty(&gcwq->worklist)) | ||
| 746 | wake_up_worker(gcwq); | ||
| 747 | } else | ||
| 748 | atomic_dec(nr_running); | ||
| 749 | } | ||
| 750 | |||
| 751 | worker->flags |= flags; | ||
| 246 | } | 752 | } |
| 247 | 753 | ||
| 754 | /** | ||
| 755 | * worker_clr_flags - clear worker flags and adjust nr_running accordingly | ||
| 756 | * @worker: self | ||
| 757 | * @flags: flags to clear | ||
| 758 | * | ||
| 759 | * Clear @flags in @worker->flags and adjust nr_running accordingly. | ||
| 760 | * | ||
| 761 | * CONTEXT: | ||
| 762 | * spin_lock_irq(gcwq->lock) | ||
| 763 | */ | ||
| 764 | static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | ||
| 765 | { | ||
| 766 | struct global_cwq *gcwq = worker->gcwq; | ||
| 767 | unsigned int oflags = worker->flags; | ||
| 768 | |||
| 769 | WARN_ON_ONCE(worker->task != current); | ||
| 770 | |||
| 771 | worker->flags &= ~flags; | ||
| 772 | |||
| 773 | /* if transitioning out of NOT_RUNNING, increment nr_running */ | ||
| 774 | if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) | ||
| 775 | if (!(worker->flags & WORKER_NOT_RUNNING)) | ||
| 776 | atomic_inc(get_gcwq_nr_running(gcwq->cpu)); | ||
| 777 | } | ||
| 778 | |||
| 779 | /** | ||
| 780 | * busy_worker_head - return the busy hash head for a work | ||
| 781 | * @gcwq: gcwq of interest | ||
| 782 | * @work: work to be hashed | ||
| 783 | * | ||
| 784 | * Return hash head of @gcwq for @work. | ||
| 785 | * | ||
| 786 | * CONTEXT: | ||
| 787 | * spin_lock_irq(gcwq->lock). | ||
| 788 | * | ||
| 789 | * RETURNS: | ||
| 790 | * Pointer to the hash head. | ||
| 791 | */ | ||
| 792 | static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, | ||
| 793 | struct work_struct *work) | ||
| 794 | { | ||
| 795 | const int base_shift = ilog2(sizeof(struct work_struct)); | ||
| 796 | unsigned long v = (unsigned long)work; | ||
| 797 | |||
| 798 | /* simple shift and fold hash, do we need something better? */ | ||
| 799 | v >>= base_shift; | ||
| 800 | v += v >> BUSY_WORKER_HASH_ORDER; | ||
| 801 | v &= BUSY_WORKER_HASH_MASK; | ||
| 802 | |||
| 803 | return &gcwq->busy_hash[v]; | ||
| 804 | } | ||
| 805 | |||
| 806 | /** | ||
| 807 | * __find_worker_executing_work - find worker which is executing a work | ||
| 808 | * @gcwq: gcwq of interest | ||
| 809 | * @bwh: hash head as returned by busy_worker_head() | ||
| 810 | * @work: work to find worker for | ||
| 811 | * | ||
| 812 | * Find a worker which is executing @work on @gcwq. @bwh should be | ||
| 813 | * the hash head obtained by calling busy_worker_head() with the same | ||
| 814 | * work. | ||
| 815 | * | ||
| 816 | * CONTEXT: | ||
| 817 | * spin_lock_irq(gcwq->lock). | ||
| 818 | * | ||
| 819 | * RETURNS: | ||
| 820 | * Pointer to worker which is executing @work if found, NULL | ||
| 821 | * otherwise. | ||
| 822 | */ | ||
| 823 | static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, | ||
| 824 | struct hlist_head *bwh, | ||
| 825 | struct work_struct *work) | ||
| 826 | { | ||
| 827 | struct worker *worker; | ||
| 828 | struct hlist_node *tmp; | ||
| 829 | |||
| 830 | hlist_for_each_entry(worker, tmp, bwh, hentry) | ||
| 831 | if (worker->current_work == work) | ||
| 832 | return worker; | ||
| 833 | return NULL; | ||
| 834 | } | ||
| 835 | |||
| 836 | /** | ||
| 837 | * find_worker_executing_work - find worker which is executing a work | ||
| 838 | * @gcwq: gcwq of interest | ||
| 839 | * @work: work to find worker for | ||
| 840 | * | ||
| 841 | * Find a worker which is executing @work on @gcwq. This function is | ||
| 842 | * identical to __find_worker_executing_work() except that this | ||
| 843 | * function calculates @bwh itself. | ||
| 844 | * | ||
| 845 | * CONTEXT: | ||
| 846 | * spin_lock_irq(gcwq->lock). | ||
| 847 | * | ||
| 848 | * RETURNS: | ||
| 849 | * Pointer to worker which is executing @work if found, NULL | ||
| 850 | * otherwise. | ||
| 851 | */ | ||
| 852 | static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | ||
| 853 | struct work_struct *work) | ||
| 854 | { | ||
| 855 | return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), | ||
| 856 | work); | ||
| 857 | } | ||
| 858 | |||
| 859 | /** | ||
| 860 | * gcwq_determine_ins_pos - find insertion position | ||
| 861 | * @gcwq: gcwq of interest | ||
| 862 | * @cwq: cwq a work is being queued for | ||
| 863 | * | ||
| 864 | * A work for @cwq is about to be queued on @gcwq, determine insertion | ||
| 865 | * position for the work. If @cwq is for HIGHPRI wq, the work is | ||
| 866 | * queued at the head of the queue but in FIFO order with respect to | ||
| 867 | * other HIGHPRI works; otherwise, at the end of the queue. This | ||
| 868 | * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that | ||
| 869 | * there are HIGHPRI works pending. | ||
| 870 | * | ||
| 871 | * CONTEXT: | ||
| 872 | * spin_lock_irq(gcwq->lock). | ||
| 873 | * | ||
| 874 | * RETURNS: | ||
| 875 | * Pointer to insertion position. | ||
| 876 | */ | ||
| 877 | static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, | ||
| 878 | struct cpu_workqueue_struct *cwq) | ||
| 879 | { | ||
| 880 | struct work_struct *twork; | ||
| 881 | |||
| 882 | if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) | ||
| 883 | return &gcwq->worklist; | ||
| 884 | |||
| 885 | list_for_each_entry(twork, &gcwq->worklist, entry) { | ||
| 886 | struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); | ||
| 887 | |||
| 888 | if (!(tcwq->wq->flags & WQ_HIGHPRI)) | ||
| 889 | break; | ||
| 890 | } | ||
| 891 | |||
| 892 | gcwq->flags |= GCWQ_HIGHPRI_PENDING; | ||
| 893 | return &twork->entry; | ||
| 894 | } | ||
| 895 | |||
| 896 | /** | ||
| 897 | * insert_work - insert a work into gcwq | ||
| 898 | * @cwq: cwq @work belongs to | ||
| 899 | * @work: work to insert | ||
| 900 | * @head: insertion point | ||
| 901 | * @extra_flags: extra WORK_STRUCT_* flags to set | ||
| 902 | * | ||
| 903 | * Insert @work which belongs to @cwq into @gcwq after @head. | ||
| 904 | * @extra_flags is or'd to work_struct flags. | ||
| 905 | * | ||
| 906 | * CONTEXT: | ||
| 907 | * spin_lock_irq(gcwq->lock). | ||
| 908 | */ | ||
| 248 | static void insert_work(struct cpu_workqueue_struct *cwq, | 909 | static void insert_work(struct cpu_workqueue_struct *cwq, |
| 249 | struct work_struct *work, struct list_head *head) | 910 | struct work_struct *work, struct list_head *head, |
| 911 | unsigned int extra_flags) | ||
| 250 | { | 912 | { |
| 251 | trace_workqueue_insertion(cwq->thread, work); | 913 | struct global_cwq *gcwq = cwq->gcwq; |
| 914 | |||
| 915 | /* we own @work, set data and link */ | ||
| 916 | set_work_cwq(work, cwq, extra_flags); | ||
| 252 | 917 | ||
| 253 | set_wq_data(work, cwq); | ||
| 254 | /* | 918 | /* |
| 255 | * Ensure that we get the right work->data if we see the | 919 | * Ensure that we get the right work->data if we see the |
| 256 | * result of list_add() below, see try_to_grab_pending(). | 920 | * result of list_add() below, see try_to_grab_pending(). |
| 257 | */ | 921 | */ |
| 258 | smp_wmb(); | 922 | smp_wmb(); |
| 923 | |||
| 259 | list_add_tail(&work->entry, head); | 924 | list_add_tail(&work->entry, head); |
| 260 | wake_up(&cwq->more_work); | 925 | |
| 926 | /* | ||
| 927 | * Ensure either worker_sched_deactivated() sees the above | ||
| 928 | * list_add_tail() or we see zero nr_running to avoid workers | ||
| 929 | * lying around lazily while there are works to be processed. | ||
| 930 | */ | ||
| 931 | smp_mb(); | ||
| 932 | |||
| 933 | if (__need_more_worker(gcwq)) | ||
| 934 | wake_up_worker(gcwq); | ||
| 261 | } | 935 | } |
| 262 | 936 | ||
| 263 | static void __queue_work(struct cpu_workqueue_struct *cwq, | 937 | static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, |
| 264 | struct work_struct *work) | 938 | struct work_struct *work) |
| 265 | { | 939 | { |
| 940 | struct global_cwq *gcwq; | ||
| 941 | struct cpu_workqueue_struct *cwq; | ||
| 942 | struct list_head *worklist; | ||
| 266 | unsigned long flags; | 943 | unsigned long flags; |
| 267 | 944 | ||
| 268 | debug_work_activate(work); | 945 | debug_work_activate(work); |
| 269 | spin_lock_irqsave(&cwq->lock, flags); | 946 | |
| 270 | insert_work(cwq, work, &cwq->worklist); | 947 | /* determine gcwq to use */ |
| 271 | spin_unlock_irqrestore(&cwq->lock, flags); | 948 | if (!(wq->flags & WQ_UNBOUND)) { |
| 949 | struct global_cwq *last_gcwq; | ||
| 950 | |||
| 951 | if (unlikely(cpu == WORK_CPU_UNBOUND)) | ||
| 952 | cpu = raw_smp_processor_id(); | ||
| 953 | |||
| 954 | /* | ||
| 955 | * It's multi cpu. If @wq is non-reentrant and @work | ||
| 956 | * was previously on a different cpu, it might still | ||
| 957 | * be running there, in which case the work needs to | ||
| 958 | * be queued on that cpu to guarantee non-reentrance. | ||
| 959 | */ | ||
| 960 | gcwq = get_gcwq(cpu); | ||
| 961 | if (wq->flags & WQ_NON_REENTRANT && | ||
| 962 | (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { | ||
| 963 | struct worker *worker; | ||
| 964 | |||
| 965 | spin_lock_irqsave(&last_gcwq->lock, flags); | ||
| 966 | |||
| 967 | worker = find_worker_executing_work(last_gcwq, work); | ||
| 968 | |||
| 969 | if (worker && worker->current_cwq->wq == wq) | ||
| 970 | gcwq = last_gcwq; | ||
| 971 | else { | ||
| 972 | /* meh... not running there, queue here */ | ||
| 973 | spin_unlock_irqrestore(&last_gcwq->lock, flags); | ||
| 974 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 975 | } | ||
| 976 | } else | ||
| 977 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 978 | } else { | ||
| 979 | gcwq = get_gcwq(WORK_CPU_UNBOUND); | ||
| 980 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 981 | } | ||
| 982 | |||
| 983 | /* gcwq determined, get cwq and queue */ | ||
| 984 | cwq = get_cwq(gcwq->cpu, wq); | ||
| 985 | |||
| 986 | BUG_ON(!list_empty(&work->entry)); | ||
| 987 | |||
| 988 | cwq->nr_in_flight[cwq->work_color]++; | ||
| 989 | |||
| 990 | if (likely(cwq->nr_active < cwq->max_active)) { | ||
| 991 | cwq->nr_active++; | ||
| 992 | worklist = gcwq_determine_ins_pos(gcwq, cwq); | ||
| 993 | } else | ||
| 994 | worklist = &cwq->delayed_works; | ||
| 995 | |||
| 996 | insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); | ||
| 997 | |||
| 998 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
| 272 | } | 999 | } |
| 273 | 1000 | ||
| 274 | /** | 1001 | /** |
| @@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) | |||
| 308 | { | 1035 | { |
| 309 | int ret = 0; | 1036 | int ret = 0; |
| 310 | 1037 | ||
| 311 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 1038 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
| 312 | BUG_ON(!list_empty(&work->entry)); | 1039 | __queue_work(cpu, wq, work); |
| 313 | __queue_work(wq_per_cpu(wq, cpu), work); | ||
| 314 | ret = 1; | 1040 | ret = 1; |
| 315 | } | 1041 | } |
| 316 | return ret; | 1042 | return ret; |
| @@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); | |||
| 320 | static void delayed_work_timer_fn(unsigned long __data) | 1046 | static void delayed_work_timer_fn(unsigned long __data) |
| 321 | { | 1047 | { |
| 322 | struct delayed_work *dwork = (struct delayed_work *)__data; | 1048 | struct delayed_work *dwork = (struct delayed_work *)__data; |
| 323 | struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); | 1049 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); |
| 324 | struct workqueue_struct *wq = cwq->wq; | ||
| 325 | 1050 | ||
| 326 | __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); | 1051 | __queue_work(smp_processor_id(), cwq->wq, &dwork->work); |
| 327 | } | 1052 | } |
| 328 | 1053 | ||
| 329 | /** | 1054 | /** |
| @@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
| 360 | struct timer_list *timer = &dwork->timer; | 1085 | struct timer_list *timer = &dwork->timer; |
| 361 | struct work_struct *work = &dwork->work; | 1086 | struct work_struct *work = &dwork->work; |
| 362 | 1087 | ||
| 363 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | 1088 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
| 1089 | unsigned int lcpu; | ||
| 1090 | |||
| 364 | BUG_ON(timer_pending(timer)); | 1091 | BUG_ON(timer_pending(timer)); |
| 365 | BUG_ON(!list_empty(&work->entry)); | 1092 | BUG_ON(!list_empty(&work->entry)); |
| 366 | 1093 | ||
| 367 | timer_stats_timer_set_start_info(&dwork->timer); | 1094 | timer_stats_timer_set_start_info(&dwork->timer); |
| 368 | 1095 | ||
| 369 | /* This stores cwq for the moment, for the timer_fn */ | 1096 | /* |
| 370 | set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); | 1097 | * This stores cwq for the moment, for the timer_fn. |
| 1098 | * Note that the work's gcwq is preserved to allow | ||
| 1099 | * reentrance detection for delayed works. | ||
| 1100 | */ | ||
| 1101 | if (!(wq->flags & WQ_UNBOUND)) { | ||
| 1102 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 1103 | |||
| 1104 | if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) | ||
| 1105 | lcpu = gcwq->cpu; | ||
| 1106 | else | ||
| 1107 | lcpu = raw_smp_processor_id(); | ||
| 1108 | } else | ||
| 1109 | lcpu = WORK_CPU_UNBOUND; | ||
| 1110 | |||
| 1111 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
| 1112 | |||
| 371 | timer->expires = jiffies + delay; | 1113 | timer->expires = jiffies + delay; |
| 372 | timer->data = (unsigned long)dwork; | 1114 | timer->data = (unsigned long)dwork; |
| 373 | timer->function = delayed_work_timer_fn; | 1115 | timer->function = delayed_work_timer_fn; |
| @@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | |||
| 382 | } | 1124 | } |
| 383 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | 1125 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); |
| 384 | 1126 | ||
| 385 | static void run_workqueue(struct cpu_workqueue_struct *cwq) | 1127 | /** |
| 1128 | * worker_enter_idle - enter idle state | ||
| 1129 | * @worker: worker which is entering idle state | ||
| 1130 | * | ||
| 1131 | * @worker is entering idle state. Update stats and idle timer if | ||
| 1132 | * necessary. | ||
| 1133 | * | ||
| 1134 | * LOCKING: | ||
| 1135 | * spin_lock_irq(gcwq->lock). | ||
| 1136 | */ | ||
| 1137 | static void worker_enter_idle(struct worker *worker) | ||
| 386 | { | 1138 | { |
| 387 | spin_lock_irq(&cwq->lock); | 1139 | struct global_cwq *gcwq = worker->gcwq; |
| 388 | while (!list_empty(&cwq->worklist)) { | 1140 | |
| 389 | struct work_struct *work = list_entry(cwq->worklist.next, | 1141 | BUG_ON(worker->flags & WORKER_IDLE); |
| 390 | struct work_struct, entry); | 1142 | BUG_ON(!list_empty(&worker->entry) && |
| 391 | work_func_t f = work->func; | 1143 | (worker->hentry.next || worker->hentry.pprev)); |
| 392 | #ifdef CONFIG_LOCKDEP | 1144 | |
| 1145 | /* can't use worker_set_flags(), also called from start_worker() */ | ||
| 1146 | worker->flags |= WORKER_IDLE; | ||
| 1147 | gcwq->nr_idle++; | ||
| 1148 | worker->last_active = jiffies; | ||
| 1149 | |||
| 1150 | /* idle_list is LIFO */ | ||
| 1151 | list_add(&worker->entry, &gcwq->idle_list); | ||
| 1152 | |||
| 1153 | if (likely(!(worker->flags & WORKER_ROGUE))) { | ||
| 1154 | if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) | ||
| 1155 | mod_timer(&gcwq->idle_timer, | ||
| 1156 | jiffies + IDLE_WORKER_TIMEOUT); | ||
| 1157 | } else | ||
| 1158 | wake_up_all(&gcwq->trustee_wait); | ||
| 1159 | |||
| 1160 | /* sanity check nr_running */ | ||
| 1161 | WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && | ||
| 1162 | atomic_read(get_gcwq_nr_running(gcwq->cpu))); | ||
| 1163 | } | ||
| 1164 | |||
| 1165 | /** | ||
| 1166 | * worker_leave_idle - leave idle state | ||
| 1167 | * @worker: worker which is leaving idle state | ||
| 1168 | * | ||
| 1169 | * @worker is leaving idle state. Update stats. | ||
| 1170 | * | ||
| 1171 | * LOCKING: | ||
| 1172 | * spin_lock_irq(gcwq->lock). | ||
| 1173 | */ | ||
| 1174 | static void worker_leave_idle(struct worker *worker) | ||
| 1175 | { | ||
| 1176 | struct global_cwq *gcwq = worker->gcwq; | ||
| 1177 | |||
| 1178 | BUG_ON(!(worker->flags & WORKER_IDLE)); | ||
| 1179 | worker_clr_flags(worker, WORKER_IDLE); | ||
| 1180 | gcwq->nr_idle--; | ||
| 1181 | list_del_init(&worker->entry); | ||
| 1182 | } | ||
| 1183 | |||
| 1184 | /** | ||
| 1185 | * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq | ||
| 1186 | * @worker: self | ||
| 1187 | * | ||
| 1188 | * Works which are scheduled while the cpu is online must at least be | ||
| 1189 | * scheduled to a worker which is bound to the cpu so that if they are | ||
| 1190 | * flushed from cpu callbacks while cpu is going down, they are | ||
| 1191 | * guaranteed to execute on the cpu. | ||
| 1192 | * | ||
| 1193 | * This function is to be used by rogue workers and rescuers to bind | ||
| 1194 | * themselves to the target cpu and may race with cpu going down or | ||
| 1195 | * coming online. kthread_bind() can't be used because it may put the | ||
| 1196 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | ||
| 1197 | * verbatim as it's best effort and blocking and gcwq may be | ||
| 1198 | * [dis]associated in the meantime. | ||
| 1199 | * | ||
| 1200 | * This function tries set_cpus_allowed() and locks gcwq and verifies | ||
| 1201 | * the binding against GCWQ_DISASSOCIATED which is set during | ||
| 1202 | * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters | ||
| 1203 | * idle state or fetches works without dropping lock, it can guarantee | ||
| 1204 | * the scheduling requirement described in the first paragraph. | ||
| 1205 | * | ||
| 1206 | * CONTEXT: | ||
| 1207 | * Might sleep. Called without any lock but returns with gcwq->lock | ||
| 1208 | * held. | ||
| 1209 | * | ||
| 1210 | * RETURNS: | ||
| 1211 | * %true if the associated gcwq is online (@worker is successfully | ||
| 1212 | * bound), %false if offline. | ||
| 1213 | */ | ||
| 1214 | static bool worker_maybe_bind_and_lock(struct worker *worker) | ||
| 1215 | { | ||
| 1216 | struct global_cwq *gcwq = worker->gcwq; | ||
| 1217 | struct task_struct *task = worker->task; | ||
| 1218 | |||
| 1219 | while (true) { | ||
| 393 | /* | 1220 | /* |
| 394 | * It is permissible to free the struct work_struct | 1221 | * The following call may fail, succeed or succeed |
| 395 | * from inside the function that is called from it, | 1222 | * without actually migrating the task to the cpu if |
| 396 | * this we need to take into account for lockdep too. | 1223 | * it races with cpu hotunplug operation. Verify |
| 397 | * To avoid bogus "held lock freed" warnings as well | 1224 | * against GCWQ_DISASSOCIATED. |
| 398 | * as problems when looking into work->lockdep_map, | ||
| 399 | * make a copy and use that here. | ||
| 400 | */ | 1225 | */ |
| 401 | struct lockdep_map lockdep_map = work->lockdep_map; | 1226 | if (!(gcwq->flags & GCWQ_DISASSOCIATED)) |
| 402 | #endif | 1227 | set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); |
| 403 | trace_workqueue_execution(cwq->thread, work); | 1228 | |
| 404 | debug_work_deactivate(work); | 1229 | spin_lock_irq(&gcwq->lock); |
| 405 | cwq->current_work = work; | 1230 | if (gcwq->flags & GCWQ_DISASSOCIATED) |
| 406 | list_del_init(cwq->worklist.next); | 1231 | return false; |
| 407 | spin_unlock_irq(&cwq->lock); | 1232 | if (task_cpu(task) == gcwq->cpu && |
| 408 | 1233 | cpumask_equal(¤t->cpus_allowed, | |
| 409 | BUG_ON(get_wq_data(work) != cwq); | 1234 | get_cpu_mask(gcwq->cpu))) |
| 410 | work_clear_pending(work); | 1235 | return true; |
| 411 | lock_map_acquire(&cwq->wq->lockdep_map); | 1236 | spin_unlock_irq(&gcwq->lock); |
| 412 | lock_map_acquire(&lockdep_map); | 1237 | |
| 413 | f(work); | 1238 | /* CPU has come up inbetween, retry migration */ |
| 414 | lock_map_release(&lockdep_map); | 1239 | cpu_relax(); |
| 415 | lock_map_release(&cwq->wq->lockdep_map); | 1240 | } |
| 416 | 1241 | } | |
| 417 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 1242 | |
| 418 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | 1243 | /* |
| 419 | "%s/0x%08x/%d\n", | 1244 | * Function for worker->rebind_work used to rebind rogue busy workers |
| 420 | current->comm, preempt_count(), | 1245 | * to the associated cpu which is coming back online. This is |
| 421 | task_pid_nr(current)); | 1246 | * scheduled by cpu up but can race with other cpu hotplug operations |
| 422 | printk(KERN_ERR " last function: "); | 1247 | * and may be executed twice without intervening cpu down. |
| 423 | print_symbol("%s\n", (unsigned long)f); | 1248 | */ |
| 424 | debug_show_held_locks(current); | 1249 | static void worker_rebind_fn(struct work_struct *work) |
| 425 | dump_stack(); | 1250 | { |
| 1251 | struct worker *worker = container_of(work, struct worker, rebind_work); | ||
| 1252 | struct global_cwq *gcwq = worker->gcwq; | ||
| 1253 | |||
| 1254 | if (worker_maybe_bind_and_lock(worker)) | ||
| 1255 | worker_clr_flags(worker, WORKER_REBIND); | ||
| 1256 | |||
| 1257 | spin_unlock_irq(&gcwq->lock); | ||
| 1258 | } | ||
| 1259 | |||
| 1260 | static struct worker *alloc_worker(void) | ||
| 1261 | { | ||
| 1262 | struct worker *worker; | ||
| 1263 | |||
| 1264 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | ||
| 1265 | if (worker) { | ||
| 1266 | INIT_LIST_HEAD(&worker->entry); | ||
| 1267 | INIT_LIST_HEAD(&worker->scheduled); | ||
| 1268 | INIT_WORK(&worker->rebind_work, worker_rebind_fn); | ||
| 1269 | /* on creation a worker is in !idle && prep state */ | ||
| 1270 | worker->flags = WORKER_PREP; | ||
| 1271 | } | ||
| 1272 | return worker; | ||
| 1273 | } | ||
| 1274 | |||
| 1275 | /** | ||
| 1276 | * create_worker - create a new workqueue worker | ||
| 1277 | * @gcwq: gcwq the new worker will belong to | ||
| 1278 | * @bind: whether to set affinity to @gcwq->cpu or not | ||
| 1279 | * | ||
| 1280 | * Create a new worker which is bound to @gcwq. The returned worker | ||
| 1281 | * can be started by calling start_worker() or destroyed using | ||
| 1282 | * destroy_worker(). | ||
| 1283 | * | ||
| 1284 | * CONTEXT: | ||
| 1285 | * Might sleep. Does GFP_KERNEL allocations. | ||
| 1286 | * | ||
| 1287 | * RETURNS: | ||
| 1288 | * Pointer to the newly created worker. | ||
| 1289 | */ | ||
| 1290 | static struct worker *create_worker(struct global_cwq *gcwq, bool bind) | ||
| 1291 | { | ||
| 1292 | bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; | ||
| 1293 | struct worker *worker = NULL; | ||
| 1294 | int id = -1; | ||
| 1295 | |||
| 1296 | spin_lock_irq(&gcwq->lock); | ||
| 1297 | while (ida_get_new(&gcwq->worker_ida, &id)) { | ||
| 1298 | spin_unlock_irq(&gcwq->lock); | ||
| 1299 | if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) | ||
| 1300 | goto fail; | ||
| 1301 | spin_lock_irq(&gcwq->lock); | ||
| 1302 | } | ||
| 1303 | spin_unlock_irq(&gcwq->lock); | ||
| 1304 | |||
| 1305 | worker = alloc_worker(); | ||
| 1306 | if (!worker) | ||
| 1307 | goto fail; | ||
| 1308 | |||
| 1309 | worker->gcwq = gcwq; | ||
| 1310 | worker->id = id; | ||
| 1311 | |||
| 1312 | if (!on_unbound_cpu) | ||
| 1313 | worker->task = kthread_create(worker_thread, worker, | ||
| 1314 | "kworker/%u:%d", gcwq->cpu, id); | ||
| 1315 | else | ||
| 1316 | worker->task = kthread_create(worker_thread, worker, | ||
| 1317 | "kworker/u:%d", id); | ||
| 1318 | if (IS_ERR(worker->task)) | ||
| 1319 | goto fail; | ||
| 1320 | |||
| 1321 | /* | ||
| 1322 | * A rogue worker will become a regular one if CPU comes | ||
| 1323 | * online later on. Make sure every worker has | ||
| 1324 | * PF_THREAD_BOUND set. | ||
| 1325 | */ | ||
| 1326 | if (bind && !on_unbound_cpu) | ||
| 1327 | kthread_bind(worker->task, gcwq->cpu); | ||
| 1328 | else { | ||
| 1329 | worker->task->flags |= PF_THREAD_BOUND; | ||
| 1330 | if (on_unbound_cpu) | ||
| 1331 | worker->flags |= WORKER_UNBOUND; | ||
| 1332 | } | ||
| 1333 | |||
| 1334 | return worker; | ||
| 1335 | fail: | ||
| 1336 | if (id >= 0) { | ||
| 1337 | spin_lock_irq(&gcwq->lock); | ||
| 1338 | ida_remove(&gcwq->worker_ida, id); | ||
| 1339 | spin_unlock_irq(&gcwq->lock); | ||
| 1340 | } | ||
| 1341 | kfree(worker); | ||
| 1342 | return NULL; | ||
| 1343 | } | ||
| 1344 | |||
| 1345 | /** | ||
| 1346 | * start_worker - start a newly created worker | ||
| 1347 | * @worker: worker to start | ||
| 1348 | * | ||
| 1349 | * Make the gcwq aware of @worker and start it. | ||
| 1350 | * | ||
| 1351 | * CONTEXT: | ||
| 1352 | * spin_lock_irq(gcwq->lock). | ||
| 1353 | */ | ||
| 1354 | static void start_worker(struct worker *worker) | ||
| 1355 | { | ||
| 1356 | worker->flags |= WORKER_STARTED; | ||
| 1357 | worker->gcwq->nr_workers++; | ||
| 1358 | worker_enter_idle(worker); | ||
| 1359 | wake_up_process(worker->task); | ||
| 1360 | } | ||
| 1361 | |||
| 1362 | /** | ||
| 1363 | * destroy_worker - destroy a workqueue worker | ||
| 1364 | * @worker: worker to be destroyed | ||
| 1365 | * | ||
| 1366 | * Destroy @worker and adjust @gcwq stats accordingly. | ||
| 1367 | * | ||
| 1368 | * CONTEXT: | ||
| 1369 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | ||
| 1370 | */ | ||
| 1371 | static void destroy_worker(struct worker *worker) | ||
| 1372 | { | ||
| 1373 | struct global_cwq *gcwq = worker->gcwq; | ||
| 1374 | int id = worker->id; | ||
| 1375 | |||
| 1376 | /* sanity check frenzy */ | ||
| 1377 | BUG_ON(worker->current_work); | ||
| 1378 | BUG_ON(!list_empty(&worker->scheduled)); | ||
| 1379 | |||
| 1380 | if (worker->flags & WORKER_STARTED) | ||
| 1381 | gcwq->nr_workers--; | ||
| 1382 | if (worker->flags & WORKER_IDLE) | ||
| 1383 | gcwq->nr_idle--; | ||
| 1384 | |||
| 1385 | list_del_init(&worker->entry); | ||
| 1386 | worker->flags |= WORKER_DIE; | ||
| 1387 | |||
| 1388 | spin_unlock_irq(&gcwq->lock); | ||
| 1389 | |||
| 1390 | kthread_stop(worker->task); | ||
| 1391 | kfree(worker); | ||
| 1392 | |||
| 1393 | spin_lock_irq(&gcwq->lock); | ||
| 1394 | ida_remove(&gcwq->worker_ida, id); | ||
| 1395 | } | ||
| 1396 | |||
| 1397 | static void idle_worker_timeout(unsigned long __gcwq) | ||
| 1398 | { | ||
| 1399 | struct global_cwq *gcwq = (void *)__gcwq; | ||
| 1400 | |||
| 1401 | spin_lock_irq(&gcwq->lock); | ||
| 1402 | |||
| 1403 | if (too_many_workers(gcwq)) { | ||
| 1404 | struct worker *worker; | ||
| 1405 | unsigned long expires; | ||
| 1406 | |||
| 1407 | /* idle_list is kept in LIFO order, check the last one */ | ||
| 1408 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); | ||
| 1409 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
| 1410 | |||
| 1411 | if (time_before(jiffies, expires)) | ||
| 1412 | mod_timer(&gcwq->idle_timer, expires); | ||
| 1413 | else { | ||
| 1414 | /* it's been idle for too long, wake up manager */ | ||
| 1415 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
| 1416 | wake_up_worker(gcwq); | ||
| 1417 | } | ||
| 1418 | } | ||
| 1419 | |||
| 1420 | spin_unlock_irq(&gcwq->lock); | ||
| 1421 | } | ||
| 1422 | |||
| 1423 | static bool send_mayday(struct work_struct *work) | ||
| 1424 | { | ||
| 1425 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
| 1426 | struct workqueue_struct *wq = cwq->wq; | ||
| 1427 | unsigned int cpu; | ||
| 1428 | |||
| 1429 | if (!(wq->flags & WQ_RESCUER)) | ||
| 1430 | return false; | ||
| 1431 | |||
| 1432 | /* mayday mayday mayday */ | ||
| 1433 | cpu = cwq->gcwq->cpu; | ||
| 1434 | /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ | ||
| 1435 | if (cpu == WORK_CPU_UNBOUND) | ||
| 1436 | cpu = 0; | ||
| 1437 | if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) | ||
| 1438 | wake_up_process(wq->rescuer->task); | ||
| 1439 | return true; | ||
| 1440 | } | ||
| 1441 | |||
| 1442 | static void gcwq_mayday_timeout(unsigned long __gcwq) | ||
| 1443 | { | ||
| 1444 | struct global_cwq *gcwq = (void *)__gcwq; | ||
| 1445 | struct work_struct *work; | ||
| 1446 | |||
| 1447 | spin_lock_irq(&gcwq->lock); | ||
| 1448 | |||
| 1449 | if (need_to_create_worker(gcwq)) { | ||
| 1450 | /* | ||
| 1451 | * We've been trying to create a new worker but | ||
| 1452 | * haven't been successful. We might be hitting an | ||
| 1453 | * allocation deadlock. Send distress signals to | ||
| 1454 | * rescuers. | ||
| 1455 | */ | ||
| 1456 | list_for_each_entry(work, &gcwq->worklist, entry) | ||
| 1457 | send_mayday(work); | ||
| 1458 | } | ||
| 1459 | |||
| 1460 | spin_unlock_irq(&gcwq->lock); | ||
| 1461 | |||
| 1462 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); | ||
| 1463 | } | ||
| 1464 | |||
| 1465 | /** | ||
| 1466 | * maybe_create_worker - create a new worker if necessary | ||
| 1467 | * @gcwq: gcwq to create a new worker for | ||
| 1468 | * | ||
| 1469 | * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to | ||
| 1470 | * have at least one idle worker on return from this function. If | ||
| 1471 | * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is | ||
| 1472 | * sent to all rescuers with works scheduled on @gcwq to resolve | ||
| 1473 | * possible allocation deadlock. | ||
| 1474 | * | ||
| 1475 | * On return, need_to_create_worker() is guaranteed to be false and | ||
| 1476 | * may_start_working() true. | ||
| 1477 | * | ||
| 1478 | * LOCKING: | ||
| 1479 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 1480 | * multiple times. Does GFP_KERNEL allocations. Called only from | ||
| 1481 | * manager. | ||
| 1482 | * | ||
| 1483 | * RETURNS: | ||
| 1484 | * false if no action was taken and gcwq->lock stayed locked, true | ||
| 1485 | * otherwise. | ||
| 1486 | */ | ||
| 1487 | static bool maybe_create_worker(struct global_cwq *gcwq) | ||
| 1488 | { | ||
| 1489 | if (!need_to_create_worker(gcwq)) | ||
| 1490 | return false; | ||
| 1491 | restart: | ||
| 1492 | spin_unlock_irq(&gcwq->lock); | ||
| 1493 | |||
| 1494 | /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ | ||
| 1495 | mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); | ||
| 1496 | |||
| 1497 | while (true) { | ||
| 1498 | struct worker *worker; | ||
| 1499 | |||
| 1500 | worker = create_worker(gcwq, true); | ||
| 1501 | if (worker) { | ||
| 1502 | del_timer_sync(&gcwq->mayday_timer); | ||
| 1503 | spin_lock_irq(&gcwq->lock); | ||
| 1504 | start_worker(worker); | ||
| 1505 | BUG_ON(need_to_create_worker(gcwq)); | ||
| 1506 | return true; | ||
| 426 | } | 1507 | } |
| 427 | 1508 | ||
| 428 | spin_lock_irq(&cwq->lock); | 1509 | if (!need_to_create_worker(gcwq)) |
| 429 | cwq->current_work = NULL; | 1510 | break; |
| 1511 | |||
| 1512 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 1513 | schedule_timeout(CREATE_COOLDOWN); | ||
| 1514 | |||
| 1515 | if (!need_to_create_worker(gcwq)) | ||
| 1516 | break; | ||
| 430 | } | 1517 | } |
| 431 | spin_unlock_irq(&cwq->lock); | 1518 | |
| 1519 | del_timer_sync(&gcwq->mayday_timer); | ||
| 1520 | spin_lock_irq(&gcwq->lock); | ||
| 1521 | if (need_to_create_worker(gcwq)) | ||
| 1522 | goto restart; | ||
| 1523 | return true; | ||
| 432 | } | 1524 | } |
| 433 | 1525 | ||
| 434 | static int worker_thread(void *__cwq) | 1526 | /** |
| 1527 | * maybe_destroy_workers - destroy workers which have been idle for a while | ||
| 1528 | * @gcwq: gcwq to destroy workers for | ||
| 1529 | * | ||
| 1530 | * Destroy @gcwq workers which have been idle for longer than | ||
| 1531 | * IDLE_WORKER_TIMEOUT. | ||
| 1532 | * | ||
| 1533 | * LOCKING: | ||
| 1534 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 1535 | * multiple times. Called only from manager. | ||
| 1536 | * | ||
| 1537 | * RETURNS: | ||
| 1538 | * false if no action was taken and gcwq->lock stayed locked, true | ||
| 1539 | * otherwise. | ||
| 1540 | */ | ||
| 1541 | static bool maybe_destroy_workers(struct global_cwq *gcwq) | ||
| 435 | { | 1542 | { |
| 436 | struct cpu_workqueue_struct *cwq = __cwq; | 1543 | bool ret = false; |
| 437 | DEFINE_WAIT(wait); | ||
| 438 | 1544 | ||
| 439 | if (cwq->wq->freezeable) | 1545 | while (too_many_workers(gcwq)) { |
| 440 | set_freezable(); | 1546 | struct worker *worker; |
| 1547 | unsigned long expires; | ||
| 441 | 1548 | ||
| 442 | for (;;) { | 1549 | worker = list_entry(gcwq->idle_list.prev, struct worker, entry); |
| 443 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); | 1550 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
| 444 | if (!freezing(current) && | ||
| 445 | !kthread_should_stop() && | ||
| 446 | list_empty(&cwq->worklist)) | ||
| 447 | schedule(); | ||
| 448 | finish_wait(&cwq->more_work, &wait); | ||
| 449 | 1551 | ||
| 450 | try_to_freeze(); | 1552 | if (time_before(jiffies, expires)) { |
| 1553 | mod_timer(&gcwq->idle_timer, expires); | ||
| 1554 | break; | ||
| 1555 | } | ||
| 451 | 1556 | ||
| 452 | if (kthread_should_stop()) | 1557 | destroy_worker(worker); |
| 1558 | ret = true; | ||
| 1559 | } | ||
| 1560 | |||
| 1561 | return ret; | ||
| 1562 | } | ||
| 1563 | |||
| 1564 | /** | ||
| 1565 | * manage_workers - manage worker pool | ||
| 1566 | * @worker: self | ||
| 1567 | * | ||
| 1568 | * Assume the manager role and manage gcwq worker pool @worker belongs | ||
| 1569 | * to. At any given time, there can be only zero or one manager per | ||
| 1570 | * gcwq. The exclusion is handled automatically by this function. | ||
| 1571 | * | ||
| 1572 | * The caller can safely start processing works on false return. On | ||
| 1573 | * true return, it's guaranteed that need_to_create_worker() is false | ||
| 1574 | * and may_start_working() is true. | ||
| 1575 | * | ||
| 1576 | * CONTEXT: | ||
| 1577 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 1578 | * multiple times. Does GFP_KERNEL allocations. | ||
| 1579 | * | ||
| 1580 | * RETURNS: | ||
| 1581 | * false if no action was taken and gcwq->lock stayed locked, true if | ||
| 1582 | * some action was taken. | ||
| 1583 | */ | ||
| 1584 | static bool manage_workers(struct worker *worker) | ||
| 1585 | { | ||
| 1586 | struct global_cwq *gcwq = worker->gcwq; | ||
| 1587 | bool ret = false; | ||
| 1588 | |||
| 1589 | if (gcwq->flags & GCWQ_MANAGING_WORKERS) | ||
| 1590 | return ret; | ||
| 1591 | |||
| 1592 | gcwq->flags &= ~GCWQ_MANAGE_WORKERS; | ||
| 1593 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
| 1594 | |||
| 1595 | /* | ||
| 1596 | * Destroy and then create so that may_start_working() is true | ||
| 1597 | * on return. | ||
| 1598 | */ | ||
| 1599 | ret |= maybe_destroy_workers(gcwq); | ||
| 1600 | ret |= maybe_create_worker(gcwq); | ||
| 1601 | |||
| 1602 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
| 1603 | |||
| 1604 | /* | ||
| 1605 | * The trustee might be waiting to take over the manager | ||
| 1606 | * position, tell it we're done. | ||
| 1607 | */ | ||
| 1608 | if (unlikely(gcwq->trustee)) | ||
| 1609 | wake_up_all(&gcwq->trustee_wait); | ||
| 1610 | |||
| 1611 | return ret; | ||
| 1612 | } | ||
| 1613 | |||
| 1614 | /** | ||
| 1615 | * move_linked_works - move linked works to a list | ||
| 1616 | * @work: start of series of works to be scheduled | ||
| 1617 | * @head: target list to append @work to | ||
| 1618 | * @nextp: out parameter for nested worklist walking | ||
| 1619 | * | ||
| 1620 | * Schedule linked works starting from @work to @head. Work series to | ||
| 1621 | * be scheduled starts at @work and includes any consecutive work with | ||
| 1622 | * WORK_STRUCT_LINKED set in its predecessor. | ||
| 1623 | * | ||
| 1624 | * If @nextp is not NULL, it's updated to point to the next work of | ||
| 1625 | * the last scheduled work. This allows move_linked_works() to be | ||
| 1626 | * nested inside outer list_for_each_entry_safe(). | ||
| 1627 | * | ||
| 1628 | * CONTEXT: | ||
| 1629 | * spin_lock_irq(gcwq->lock). | ||
| 1630 | */ | ||
| 1631 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
| 1632 | struct work_struct **nextp) | ||
| 1633 | { | ||
| 1634 | struct work_struct *n; | ||
| 1635 | |||
| 1636 | /* | ||
| 1637 | * Linked worklist will always end before the end of the list, | ||
| 1638 | * use NULL for list head. | ||
| 1639 | */ | ||
| 1640 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
| 1641 | list_move_tail(&work->entry, head); | ||
| 1642 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
| 453 | break; | 1643 | break; |
| 1644 | } | ||
| 1645 | |||
| 1646 | /* | ||
| 1647 | * If we're already inside safe list traversal and have moved | ||
| 1648 | * multiple works to the scheduled queue, the next position | ||
| 1649 | * needs to be updated. | ||
| 1650 | */ | ||
| 1651 | if (nextp) | ||
| 1652 | *nextp = n; | ||
| 1653 | } | ||
| 1654 | |||
| 1655 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
| 1656 | { | ||
| 1657 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
| 1658 | struct work_struct, entry); | ||
| 1659 | struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); | ||
| 1660 | |||
| 1661 | move_linked_works(work, pos, NULL); | ||
| 1662 | cwq->nr_active++; | ||
| 1663 | } | ||
| 454 | 1664 | ||
| 455 | run_workqueue(cwq); | 1665 | /** |
| 1666 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
| 1667 | * @cwq: cwq of interest | ||
| 1668 | * @color: color of work which left the queue | ||
| 1669 | * | ||
| 1670 | * A work either has completed or is removed from pending queue, | ||
| 1671 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
| 1672 | * | ||
| 1673 | * CONTEXT: | ||
| 1674 | * spin_lock_irq(gcwq->lock). | ||
| 1675 | */ | ||
| 1676 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) | ||
| 1677 | { | ||
| 1678 | /* ignore uncolored works */ | ||
| 1679 | if (color == WORK_NO_COLOR) | ||
| 1680 | return; | ||
| 1681 | |||
| 1682 | cwq->nr_in_flight[color]--; | ||
| 1683 | cwq->nr_active--; | ||
| 1684 | |||
| 1685 | if (!list_empty(&cwq->delayed_works)) { | ||
| 1686 | /* one down, submit a delayed one */ | ||
| 1687 | if (cwq->nr_active < cwq->max_active) | ||
| 1688 | cwq_activate_first_delayed(cwq); | ||
| 456 | } | 1689 | } |
| 457 | 1690 | ||
| 458 | return 0; | 1691 | /* is flush in progress and are we at the flushing tip? */ |
| 1692 | if (likely(cwq->flush_color != color)) | ||
| 1693 | return; | ||
| 1694 | |||
| 1695 | /* are there still in-flight works? */ | ||
| 1696 | if (cwq->nr_in_flight[color]) | ||
| 1697 | return; | ||
| 1698 | |||
| 1699 | /* this cwq is done, clear flush_color */ | ||
| 1700 | cwq->flush_color = -1; | ||
| 1701 | |||
| 1702 | /* | ||
| 1703 | * If this was the last cwq, wake up the first flusher. It | ||
| 1704 | * will handle the rest. | ||
| 1705 | */ | ||
| 1706 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
| 1707 | complete(&cwq->wq->first_flusher->done); | ||
| 1708 | } | ||
| 1709 | |||
| 1710 | /** | ||
| 1711 | * process_one_work - process single work | ||
| 1712 | * @worker: self | ||
| 1713 | * @work: work to process | ||
| 1714 | * | ||
| 1715 | * Process @work. This function contains all the logic necessary to | ||
| 1716 | * process a single work including synchronization against and | ||
| 1717 | * interaction with other workers on the same cpu, queueing and | ||
| 1718 | * flushing. As long as context requirement is met, any worker can | ||
| 1719 | * call this function to process a work. | ||
| 1720 | * | ||
| 1721 | * CONTEXT: | ||
| 1722 | * spin_lock_irq(gcwq->lock) which is released and regrabbed. | ||
| 1723 | */ | ||
| 1724 | static void process_one_work(struct worker *worker, struct work_struct *work) | ||
| 1725 | { | ||
| 1726 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
| 1727 | struct global_cwq *gcwq = cwq->gcwq; | ||
| 1728 | struct hlist_head *bwh = busy_worker_head(gcwq, work); | ||
| 1729 | bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; | ||
| 1730 | work_func_t f = work->func; | ||
| 1731 | int work_color; | ||
| 1732 | struct worker *collision; | ||
| 1733 | #ifdef CONFIG_LOCKDEP | ||
| 1734 | /* | ||
| 1735 | * It is permissible to free the struct work_struct from | ||
| 1736 | * inside the function that is called from it, this we need to | ||
| 1737 | * take into account for lockdep too. To avoid bogus "held | ||
| 1738 | * lock freed" warnings as well as problems when looking into | ||
| 1739 | * work->lockdep_map, make a copy and use that here. | ||
| 1740 | */ | ||
| 1741 | struct lockdep_map lockdep_map = work->lockdep_map; | ||
| 1742 | #endif | ||
| 1743 | /* | ||
| 1744 | * A single work shouldn't be executed concurrently by | ||
| 1745 | * multiple workers on a single cpu. Check whether anyone is | ||
| 1746 | * already processing the work. If so, defer the work to the | ||
| 1747 | * currently executing one. | ||
| 1748 | */ | ||
| 1749 | collision = __find_worker_executing_work(gcwq, bwh, work); | ||
| 1750 | if (unlikely(collision)) { | ||
| 1751 | move_linked_works(work, &collision->scheduled, NULL); | ||
| 1752 | return; | ||
| 1753 | } | ||
| 1754 | |||
| 1755 | /* claim and process */ | ||
| 1756 | debug_work_deactivate(work); | ||
| 1757 | hlist_add_head(&worker->hentry, bwh); | ||
| 1758 | worker->current_work = work; | ||
| 1759 | worker->current_cwq = cwq; | ||
| 1760 | work_color = get_work_color(work); | ||
| 1761 | |||
| 1762 | /* record the current cpu number in the work data and dequeue */ | ||
| 1763 | set_work_cpu(work, gcwq->cpu); | ||
| 1764 | list_del_init(&work->entry); | ||
| 1765 | |||
| 1766 | /* | ||
| 1767 | * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, | ||
| 1768 | * wake up another worker; otherwise, clear HIGHPRI_PENDING. | ||
| 1769 | */ | ||
| 1770 | if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { | ||
| 1771 | struct work_struct *nwork = list_first_entry(&gcwq->worklist, | ||
| 1772 | struct work_struct, entry); | ||
| 1773 | |||
| 1774 | if (!list_empty(&gcwq->worklist) && | ||
| 1775 | get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) | ||
| 1776 | wake_up_worker(gcwq); | ||
| 1777 | else | ||
| 1778 | gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; | ||
| 1779 | } | ||
| 1780 | |||
| 1781 | /* | ||
| 1782 | * CPU intensive works don't participate in concurrency | ||
| 1783 | * management. They're the scheduler's responsibility. | ||
| 1784 | */ | ||
| 1785 | if (unlikely(cpu_intensive)) | ||
| 1786 | worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); | ||
| 1787 | |||
| 1788 | spin_unlock_irq(&gcwq->lock); | ||
| 1789 | |||
| 1790 | work_clear_pending(work); | ||
| 1791 | lock_map_acquire(&cwq->wq->lockdep_map); | ||
| 1792 | lock_map_acquire(&lockdep_map); | ||
| 1793 | f(work); | ||
| 1794 | lock_map_release(&lockdep_map); | ||
| 1795 | lock_map_release(&cwq->wq->lockdep_map); | ||
| 1796 | |||
| 1797 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | ||
| 1798 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | ||
| 1799 | "%s/0x%08x/%d\n", | ||
| 1800 | current->comm, preempt_count(), task_pid_nr(current)); | ||
| 1801 | printk(KERN_ERR " last function: "); | ||
| 1802 | print_symbol("%s\n", (unsigned long)f); | ||
| 1803 | debug_show_held_locks(current); | ||
| 1804 | dump_stack(); | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | spin_lock_irq(&gcwq->lock); | ||
| 1808 | |||
| 1809 | /* clear cpu intensive status */ | ||
| 1810 | if (unlikely(cpu_intensive)) | ||
| 1811 | worker_clr_flags(worker, WORKER_CPU_INTENSIVE); | ||
| 1812 | |||
| 1813 | /* we're done with it, release */ | ||
| 1814 | hlist_del_init(&worker->hentry); | ||
| 1815 | worker->current_work = NULL; | ||
| 1816 | worker->current_cwq = NULL; | ||
| 1817 | cwq_dec_nr_in_flight(cwq, work_color); | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | /** | ||
| 1821 | * process_scheduled_works - process scheduled works | ||
| 1822 | * @worker: self | ||
| 1823 | * | ||
| 1824 | * Process all scheduled works. Please note that the scheduled list | ||
| 1825 | * may change while processing a work, so this function repeatedly | ||
| 1826 | * fetches a work from the top and executes it. | ||
| 1827 | * | ||
| 1828 | * CONTEXT: | ||
| 1829 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 1830 | * multiple times. | ||
| 1831 | */ | ||
| 1832 | static void process_scheduled_works(struct worker *worker) | ||
| 1833 | { | ||
| 1834 | while (!list_empty(&worker->scheduled)) { | ||
| 1835 | struct work_struct *work = list_first_entry(&worker->scheduled, | ||
| 1836 | struct work_struct, entry); | ||
| 1837 | process_one_work(worker, work); | ||
| 1838 | } | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | /** | ||
| 1842 | * worker_thread - the worker thread function | ||
| 1843 | * @__worker: self | ||
| 1844 | * | ||
| 1845 | * The gcwq worker thread function. There's a single dynamic pool of | ||
| 1846 | * these per each cpu. These workers process all works regardless of | ||
| 1847 | * their specific target workqueue. The only exception is works which | ||
| 1848 | * belong to workqueues with a rescuer which will be explained in | ||
| 1849 | * rescuer_thread(). | ||
| 1850 | */ | ||
| 1851 | static int worker_thread(void *__worker) | ||
| 1852 | { | ||
| 1853 | struct worker *worker = __worker; | ||
| 1854 | struct global_cwq *gcwq = worker->gcwq; | ||
| 1855 | |||
| 1856 | /* tell the scheduler that this is a workqueue worker */ | ||
| 1857 | worker->task->flags |= PF_WQ_WORKER; | ||
| 1858 | woke_up: | ||
| 1859 | spin_lock_irq(&gcwq->lock); | ||
| 1860 | |||
| 1861 | /* DIE can be set only while we're idle, checking here is enough */ | ||
| 1862 | if (worker->flags & WORKER_DIE) { | ||
| 1863 | spin_unlock_irq(&gcwq->lock); | ||
| 1864 | worker->task->flags &= ~PF_WQ_WORKER; | ||
| 1865 | return 0; | ||
| 1866 | } | ||
| 1867 | |||
| 1868 | worker_leave_idle(worker); | ||
| 1869 | recheck: | ||
| 1870 | /* no more worker necessary? */ | ||
| 1871 | if (!need_more_worker(gcwq)) | ||
| 1872 | goto sleep; | ||
| 1873 | |||
| 1874 | /* do we need to manage? */ | ||
| 1875 | if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) | ||
| 1876 | goto recheck; | ||
| 1877 | |||
| 1878 | /* | ||
| 1879 | * ->scheduled list can only be filled while a worker is | ||
| 1880 | * preparing to process a work or actually processing it. | ||
| 1881 | * Make sure nobody diddled with it while I was sleeping. | ||
| 1882 | */ | ||
| 1883 | BUG_ON(!list_empty(&worker->scheduled)); | ||
| 1884 | |||
| 1885 | /* | ||
| 1886 | * When control reaches this point, we're guaranteed to have | ||
| 1887 | * at least one idle worker or that someone else has already | ||
| 1888 | * assumed the manager role. | ||
| 1889 | */ | ||
| 1890 | worker_clr_flags(worker, WORKER_PREP); | ||
| 1891 | |||
| 1892 | do { | ||
| 1893 | struct work_struct *work = | ||
| 1894 | list_first_entry(&gcwq->worklist, | ||
| 1895 | struct work_struct, entry); | ||
| 1896 | |||
| 1897 | if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { | ||
| 1898 | /* optimization path, not strictly necessary */ | ||
| 1899 | process_one_work(worker, work); | ||
| 1900 | if (unlikely(!list_empty(&worker->scheduled))) | ||
| 1901 | process_scheduled_works(worker); | ||
| 1902 | } else { | ||
| 1903 | move_linked_works(work, &worker->scheduled, NULL); | ||
| 1904 | process_scheduled_works(worker); | ||
| 1905 | } | ||
| 1906 | } while (keep_working(gcwq)); | ||
| 1907 | |||
| 1908 | worker_set_flags(worker, WORKER_PREP, false); | ||
| 1909 | sleep: | ||
| 1910 | if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) | ||
| 1911 | goto recheck; | ||
| 1912 | |||
| 1913 | /* | ||
| 1914 | * gcwq->lock is held and there's no work to process and no | ||
| 1915 | * need to manage, sleep. Workers are woken up only while | ||
| 1916 | * holding gcwq->lock or from local cpu, so setting the | ||
| 1917 | * current state before releasing gcwq->lock is enough to | ||
| 1918 | * prevent losing any event. | ||
| 1919 | */ | ||
| 1920 | worker_enter_idle(worker); | ||
| 1921 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 1922 | spin_unlock_irq(&gcwq->lock); | ||
| 1923 | schedule(); | ||
| 1924 | goto woke_up; | ||
| 1925 | } | ||
| 1926 | |||
| 1927 | /** | ||
| 1928 | * rescuer_thread - the rescuer thread function | ||
| 1929 | * @__wq: the associated workqueue | ||
| 1930 | * | ||
| 1931 | * Workqueue rescuer thread function. There's one rescuer for each | ||
| 1932 | * workqueue which has WQ_RESCUER set. | ||
| 1933 | * | ||
| 1934 | * Regular work processing on a gcwq may block trying to create a new | ||
| 1935 | * worker which uses GFP_KERNEL allocation which has slight chance of | ||
| 1936 | * developing into deadlock if some works currently on the same queue | ||
| 1937 | * need to be processed to satisfy the GFP_KERNEL allocation. This is | ||
| 1938 | * the problem rescuer solves. | ||
| 1939 | * | ||
| 1940 | * When such condition is possible, the gcwq summons rescuers of all | ||
| 1941 | * workqueues which have works queued on the gcwq and let them process | ||
| 1942 | * those works so that forward progress can be guaranteed. | ||
| 1943 | * | ||
| 1944 | * This should happen rarely. | ||
| 1945 | */ | ||
| 1946 | static int rescuer_thread(void *__wq) | ||
| 1947 | { | ||
| 1948 | struct workqueue_struct *wq = __wq; | ||
| 1949 | struct worker *rescuer = wq->rescuer; | ||
| 1950 | struct list_head *scheduled = &rescuer->scheduled; | ||
| 1951 | bool is_unbound = wq->flags & WQ_UNBOUND; | ||
| 1952 | unsigned int cpu; | ||
| 1953 | |||
| 1954 | set_user_nice(current, RESCUER_NICE_LEVEL); | ||
| 1955 | repeat: | ||
| 1956 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1957 | |||
| 1958 | if (kthread_should_stop()) | ||
| 1959 | return 0; | ||
| 1960 | |||
| 1961 | /* | ||
| 1962 | * See whether any cpu is asking for help. Unbound | ||
| 1963 | * workqueues use cpu 0 in mayday_mask for WORK_CPU_UNBOUND. | ||
| 1964 | */ | ||
| 1965 | for_each_mayday_cpu(cpu, wq->mayday_mask) { | ||
| 1966 | unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; | ||
| 1967 | struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); | ||
| 1968 | struct global_cwq *gcwq = cwq->gcwq; | ||
| 1969 | struct work_struct *work, *n; | ||
| 1970 | |||
| 1971 | __set_current_state(TASK_RUNNING); | ||
| 1972 | mayday_clear_cpu(cpu, wq->mayday_mask); | ||
| 1973 | |||
| 1974 | /* migrate to the target cpu if possible */ | ||
| 1975 | rescuer->gcwq = gcwq; | ||
| 1976 | worker_maybe_bind_and_lock(rescuer); | ||
| 1977 | |||
| 1978 | /* | ||
| 1979 | * Slurp in all works issued via this workqueue and | ||
| 1980 | * process'em. | ||
| 1981 | */ | ||
| 1982 | BUG_ON(!list_empty(&rescuer->scheduled)); | ||
| 1983 | list_for_each_entry_safe(work, n, &gcwq->worklist, entry) | ||
| 1984 | if (get_work_cwq(work) == cwq) | ||
| 1985 | move_linked_works(work, scheduled, &n); | ||
| 1986 | |||
| 1987 | process_scheduled_works(rescuer); | ||
| 1988 | spin_unlock_irq(&gcwq->lock); | ||
| 1989 | } | ||
| 1990 | |||
| 1991 | schedule(); | ||
| 1992 | goto repeat; | ||
| 459 | } | 1993 | } |
| 460 | 1994 | ||
| 461 | struct wq_barrier { | 1995 | struct wq_barrier { |
| @@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work) | |||
| 469 | complete(&barr->done); | 2003 | complete(&barr->done); |
| 470 | } | 2004 | } |
| 471 | 2005 | ||
| 2006 | /** | ||
| 2007 | * insert_wq_barrier - insert a barrier work | ||
| 2008 | * @cwq: cwq to insert barrier into | ||
| 2009 | * @barr: wq_barrier to insert | ||
| 2010 | * @target: target work to attach @barr to | ||
| 2011 | * @worker: worker currently executing @target, NULL if @target is not executing | ||
| 2012 | * | ||
| 2013 | * @barr is linked to @target such that @barr is completed only after | ||
| 2014 | * @target finishes execution. Please note that the ordering | ||
| 2015 | * guarantee is observed only with respect to @target and on the local | ||
| 2016 | * cpu. | ||
| 2017 | * | ||
| 2018 | * Currently, a queued barrier can't be canceled. This is because | ||
| 2019 | * try_to_grab_pending() can't determine whether the work to be | ||
| 2020 | * grabbed is at the head of the queue and thus can't clear LINKED | ||
| 2021 | * flag of the previous work while there must be a valid next work | ||
| 2022 | * after a work with LINKED flag set. | ||
| 2023 | * | ||
| 2024 | * Note that when @worker is non-NULL, @target may be modified | ||
| 2025 | * underneath us, so we can't reliably determine cwq from @target. | ||
| 2026 | * | ||
| 2027 | * CONTEXT: | ||
| 2028 | * spin_lock_irq(gcwq->lock). | ||
| 2029 | */ | ||
| 472 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | 2030 | static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, |
| 473 | struct wq_barrier *barr, struct list_head *head) | 2031 | struct wq_barrier *barr, |
| 2032 | struct work_struct *target, struct worker *worker) | ||
| 474 | { | 2033 | { |
| 2034 | struct list_head *head; | ||
| 2035 | unsigned int linked = 0; | ||
| 2036 | |||
| 475 | /* | 2037 | /* |
| 476 | * debugobject calls are safe here even with cwq->lock locked | 2038 | * debugobject calls are safe here even with gcwq->lock locked |
| 477 | * as we know for sure that this will not trigger any of the | 2039 | * as we know for sure that this will not trigger any of the |
| 478 | * checks and call back into the fixup functions where we | 2040 | * checks and call back into the fixup functions where we |
| 479 | * might deadlock. | 2041 | * might deadlock. |
| 480 | */ | 2042 | */ |
| 481 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); | 2043 | INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); |
| 482 | __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); | 2044 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); |
| 483 | |||
| 484 | init_completion(&barr->done); | 2045 | init_completion(&barr->done); |
| 485 | 2046 | ||
| 2047 | /* | ||
| 2048 | * If @target is currently being executed, schedule the | ||
| 2049 | * barrier to the worker; otherwise, put it after @target. | ||
| 2050 | */ | ||
| 2051 | if (worker) | ||
| 2052 | head = worker->scheduled.next; | ||
| 2053 | else { | ||
| 2054 | unsigned long *bits = work_data_bits(target); | ||
| 2055 | |||
| 2056 | head = target->entry.next; | ||
| 2057 | /* there can already be other linked works, inherit and set */ | ||
| 2058 | linked = *bits & WORK_STRUCT_LINKED; | ||
| 2059 | __set_bit(WORK_STRUCT_LINKED_BIT, bits); | ||
| 2060 | } | ||
| 2061 | |||
| 486 | debug_work_activate(&barr->work); | 2062 | debug_work_activate(&barr->work); |
| 487 | insert_work(cwq, &barr->work, head); | 2063 | insert_work(cwq, &barr->work, head, |
| 2064 | work_color_to_flags(WORK_NO_COLOR) | linked); | ||
| 488 | } | 2065 | } |
| 489 | 2066 | ||
| 490 | static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | 2067 | /** |
| 2068 | * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing | ||
| 2069 | * @wq: workqueue being flushed | ||
| 2070 | * @flush_color: new flush color, < 0 for no-op | ||
| 2071 | * @work_color: new work color, < 0 for no-op | ||
| 2072 | * | ||
| 2073 | * Prepare cwqs for workqueue flushing. | ||
| 2074 | * | ||
| 2075 | * If @flush_color is non-negative, flush_color on all cwqs should be | ||
| 2076 | * -1. If no cwq has in-flight commands at the specified color, all | ||
| 2077 | * cwq->flush_color's stay at -1 and %false is returned. If any cwq | ||
| 2078 | * has in flight commands, its cwq->flush_color is set to | ||
| 2079 | * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq | ||
| 2080 | * wakeup logic is armed and %true is returned. | ||
| 2081 | * | ||
| 2082 | * The caller should have initialized @wq->first_flusher prior to | ||
| 2083 | * calling this function with non-negative @flush_color. If | ||
| 2084 | * @flush_color is negative, no flush color update is done and %false | ||
| 2085 | * is returned. | ||
| 2086 | * | ||
| 2087 | * If @work_color is non-negative, all cwqs should have the same | ||
| 2088 | * work_color which is previous to @work_color and all will be | ||
| 2089 | * advanced to @work_color. | ||
| 2090 | * | ||
| 2091 | * CONTEXT: | ||
| 2092 | * mutex_lock(wq->flush_mutex). | ||
| 2093 | * | ||
| 2094 | * RETURNS: | ||
| 2095 | * %true if @flush_color >= 0 and there's something to flush. %false | ||
| 2096 | * otherwise. | ||
| 2097 | */ | ||
| 2098 | static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, | ||
| 2099 | int flush_color, int work_color) | ||
| 491 | { | 2100 | { |
| 492 | int active = 0; | 2101 | bool wait = false; |
| 493 | struct wq_barrier barr; | 2102 | unsigned int cpu; |
| 494 | 2103 | ||
| 495 | WARN_ON(cwq->thread == current); | 2104 | if (flush_color >= 0) { |
| 496 | 2105 | BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); | |
| 497 | spin_lock_irq(&cwq->lock); | 2106 | atomic_set(&wq->nr_cwqs_to_flush, 1); |
| 498 | if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { | ||
| 499 | insert_wq_barrier(cwq, &barr, &cwq->worklist); | ||
| 500 | active = 1; | ||
| 501 | } | 2107 | } |
| 502 | spin_unlock_irq(&cwq->lock); | ||
| 503 | 2108 | ||
| 504 | if (active) { | 2109 | for_each_cwq_cpu(cpu, wq) { |
| 505 | wait_for_completion(&barr.done); | 2110 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
| 506 | destroy_work_on_stack(&barr.work); | 2111 | struct global_cwq *gcwq = cwq->gcwq; |
| 2112 | |||
| 2113 | spin_lock_irq(&gcwq->lock); | ||
| 2114 | |||
| 2115 | if (flush_color >= 0) { | ||
| 2116 | BUG_ON(cwq->flush_color != -1); | ||
| 2117 | |||
| 2118 | if (cwq->nr_in_flight[flush_color]) { | ||
| 2119 | cwq->flush_color = flush_color; | ||
| 2120 | atomic_inc(&wq->nr_cwqs_to_flush); | ||
| 2121 | wait = true; | ||
| 2122 | } | ||
| 2123 | } | ||
| 2124 | |||
| 2125 | if (work_color >= 0) { | ||
| 2126 | BUG_ON(work_color != work_next_color(cwq->work_color)); | ||
| 2127 | cwq->work_color = work_color; | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | spin_unlock_irq(&gcwq->lock); | ||
| 507 | } | 2131 | } |
| 508 | 2132 | ||
| 509 | return active; | 2133 | if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) |
| 2134 | complete(&wq->first_flusher->done); | ||
| 2135 | |||
| 2136 | return wait; | ||
| 510 | } | 2137 | } |
| 511 | 2138 | ||
| 512 | /** | 2139 | /** |
| @@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | |||
| 518 | * | 2145 | * |
| 519 | * We sleep until all works which were queued on entry have been handled, | 2146 | * We sleep until all works which were queued on entry have been handled, |
| 520 | * but we are not livelocked by new incoming ones. | 2147 | * but we are not livelocked by new incoming ones. |
| 521 | * | ||
| 522 | * This function used to run the workqueues itself. Now we just wait for the | ||
| 523 | * helper threads to do it. | ||
| 524 | */ | 2148 | */ |
| 525 | void flush_workqueue(struct workqueue_struct *wq) | 2149 | void flush_workqueue(struct workqueue_struct *wq) |
| 526 | { | 2150 | { |
| 527 | const struct cpumask *cpu_map = wq_cpu_map(wq); | 2151 | struct wq_flusher this_flusher = { |
| 528 | int cpu; | 2152 | .list = LIST_HEAD_INIT(this_flusher.list), |
| 2153 | .flush_color = -1, | ||
| 2154 | .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), | ||
| 2155 | }; | ||
| 2156 | int next_color; | ||
| 529 | 2157 | ||
| 530 | might_sleep(); | ||
| 531 | lock_map_acquire(&wq->lockdep_map); | 2158 | lock_map_acquire(&wq->lockdep_map); |
| 532 | lock_map_release(&wq->lockdep_map); | 2159 | lock_map_release(&wq->lockdep_map); |
| 533 | for_each_cpu(cpu, cpu_map) | 2160 | |
| 534 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); | 2161 | mutex_lock(&wq->flush_mutex); |
| 2162 | |||
| 2163 | /* | ||
| 2164 | * Start-to-wait phase | ||
| 2165 | */ | ||
| 2166 | next_color = work_next_color(wq->work_color); | ||
| 2167 | |||
| 2168 | if (next_color != wq->flush_color) { | ||
| 2169 | /* | ||
| 2170 | * Color space is not full. The current work_color | ||
| 2171 | * becomes our flush_color and work_color is advanced | ||
| 2172 | * by one. | ||
| 2173 | */ | ||
| 2174 | BUG_ON(!list_empty(&wq->flusher_overflow)); | ||
| 2175 | this_flusher.flush_color = wq->work_color; | ||
| 2176 | wq->work_color = next_color; | ||
| 2177 | |||
| 2178 | if (!wq->first_flusher) { | ||
| 2179 | /* no flush in progress, become the first flusher */ | ||
| 2180 | BUG_ON(wq->flush_color != this_flusher.flush_color); | ||
| 2181 | |||
| 2182 | wq->first_flusher = &this_flusher; | ||
| 2183 | |||
| 2184 | if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, | ||
| 2185 | wq->work_color)) { | ||
| 2186 | /* nothing to flush, done */ | ||
| 2187 | wq->flush_color = next_color; | ||
| 2188 | wq->first_flusher = NULL; | ||
| 2189 | goto out_unlock; | ||
| 2190 | } | ||
| 2191 | } else { | ||
| 2192 | /* wait in queue */ | ||
| 2193 | BUG_ON(wq->flush_color == this_flusher.flush_color); | ||
| 2194 | list_add_tail(&this_flusher.list, &wq->flusher_queue); | ||
| 2195 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | ||
| 2196 | } | ||
| 2197 | } else { | ||
| 2198 | /* | ||
| 2199 | * Oops, color space is full, wait on overflow queue. | ||
| 2200 | * The next flush completion will assign us | ||
| 2201 | * flush_color and transfer to flusher_queue. | ||
| 2202 | */ | ||
| 2203 | list_add_tail(&this_flusher.list, &wq->flusher_overflow); | ||
| 2204 | } | ||
| 2205 | |||
| 2206 | mutex_unlock(&wq->flush_mutex); | ||
| 2207 | |||
| 2208 | wait_for_completion(&this_flusher.done); | ||
| 2209 | |||
| 2210 | /* | ||
| 2211 | * Wake-up-and-cascade phase | ||
| 2212 | * | ||
| 2213 | * First flushers are responsible for cascading flushes and | ||
| 2214 | * handling overflow. Non-first flushers can simply return. | ||
| 2215 | */ | ||
| 2216 | if (wq->first_flusher != &this_flusher) | ||
| 2217 | return; | ||
| 2218 | |||
| 2219 | mutex_lock(&wq->flush_mutex); | ||
| 2220 | |||
| 2221 | /* we might have raced, check again with mutex held */ | ||
| 2222 | if (wq->first_flusher != &this_flusher) | ||
| 2223 | goto out_unlock; | ||
| 2224 | |||
| 2225 | wq->first_flusher = NULL; | ||
| 2226 | |||
| 2227 | BUG_ON(!list_empty(&this_flusher.list)); | ||
| 2228 | BUG_ON(wq->flush_color != this_flusher.flush_color); | ||
| 2229 | |||
| 2230 | while (true) { | ||
| 2231 | struct wq_flusher *next, *tmp; | ||
| 2232 | |||
| 2233 | /* complete all the flushers sharing the current flush color */ | ||
| 2234 | list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { | ||
| 2235 | if (next->flush_color != wq->flush_color) | ||
| 2236 | break; | ||
| 2237 | list_del_init(&next->list); | ||
| 2238 | complete(&next->done); | ||
| 2239 | } | ||
| 2240 | |||
| 2241 | BUG_ON(!list_empty(&wq->flusher_overflow) && | ||
| 2242 | wq->flush_color != work_next_color(wq->work_color)); | ||
| 2243 | |||
| 2244 | /* this flush_color is finished, advance by one */ | ||
| 2245 | wq->flush_color = work_next_color(wq->flush_color); | ||
| 2246 | |||
| 2247 | /* one color has been freed, handle overflow queue */ | ||
| 2248 | if (!list_empty(&wq->flusher_overflow)) { | ||
| 2249 | /* | ||
| 2250 | * Assign the same color to all overflowed | ||
| 2251 | * flushers, advance work_color and append to | ||
| 2252 | * flusher_queue. This is the start-to-wait | ||
| 2253 | * phase for these overflowed flushers. | ||
| 2254 | */ | ||
| 2255 | list_for_each_entry(tmp, &wq->flusher_overflow, list) | ||
| 2256 | tmp->flush_color = wq->work_color; | ||
| 2257 | |||
| 2258 | wq->work_color = work_next_color(wq->work_color); | ||
| 2259 | |||
| 2260 | list_splice_tail_init(&wq->flusher_overflow, | ||
| 2261 | &wq->flusher_queue); | ||
| 2262 | flush_workqueue_prep_cwqs(wq, -1, wq->work_color); | ||
| 2263 | } | ||
| 2264 | |||
| 2265 | if (list_empty(&wq->flusher_queue)) { | ||
| 2266 | BUG_ON(wq->flush_color != wq->work_color); | ||
| 2267 | break; | ||
| 2268 | } | ||
| 2269 | |||
| 2270 | /* | ||
| 2271 | * Need to flush more colors. Make the next flusher | ||
| 2272 | * the new first flusher and arm cwqs. | ||
| 2273 | */ | ||
| 2274 | BUG_ON(wq->flush_color == wq->work_color); | ||
| 2275 | BUG_ON(wq->flush_color != next->flush_color); | ||
| 2276 | |||
| 2277 | list_del_init(&next->list); | ||
| 2278 | wq->first_flusher = next; | ||
| 2279 | |||
| 2280 | if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) | ||
| 2281 | break; | ||
| 2282 | |||
| 2283 | /* | ||
| 2284 | * Meh... this color is already done, clear first | ||
| 2285 | * flusher and repeat cascading. | ||
| 2286 | */ | ||
| 2287 | wq->first_flusher = NULL; | ||
| 2288 | } | ||
| 2289 | |||
| 2290 | out_unlock: | ||
| 2291 | mutex_unlock(&wq->flush_mutex); | ||
| 535 | } | 2292 | } |
| 536 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2293 | EXPORT_SYMBOL_GPL(flush_workqueue); |
| 537 | 2294 | ||
| @@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); | |||
| 547 | */ | 2304 | */ |
| 548 | int flush_work(struct work_struct *work) | 2305 | int flush_work(struct work_struct *work) |
| 549 | { | 2306 | { |
| 2307 | struct worker *worker = NULL; | ||
| 2308 | struct global_cwq *gcwq; | ||
| 550 | struct cpu_workqueue_struct *cwq; | 2309 | struct cpu_workqueue_struct *cwq; |
| 551 | struct list_head *prev; | ||
| 552 | struct wq_barrier barr; | 2310 | struct wq_barrier barr; |
| 553 | 2311 | ||
| 554 | might_sleep(); | 2312 | might_sleep(); |
| 555 | cwq = get_wq_data(work); | 2313 | gcwq = get_work_gcwq(work); |
| 556 | if (!cwq) | 2314 | if (!gcwq) |
| 557 | return 0; | 2315 | return 0; |
| 558 | 2316 | ||
| 559 | lock_map_acquire(&cwq->wq->lockdep_map); | 2317 | spin_lock_irq(&gcwq->lock); |
| 560 | lock_map_release(&cwq->wq->lockdep_map); | ||
| 561 | |||
| 562 | prev = NULL; | ||
| 563 | spin_lock_irq(&cwq->lock); | ||
| 564 | if (!list_empty(&work->entry)) { | 2318 | if (!list_empty(&work->entry)) { |
| 565 | /* | 2319 | /* |
| 566 | * See the comment near try_to_grab_pending()->smp_rmb(). | 2320 | * See the comment near try_to_grab_pending()->smp_rmb(). |
| 567 | * If it was re-queued under us we are not going to wait. | 2321 | * If it was re-queued to a different gcwq under us, we |
| 2322 | * are not going to wait. | ||
| 568 | */ | 2323 | */ |
| 569 | smp_rmb(); | 2324 | smp_rmb(); |
| 570 | if (unlikely(cwq != get_wq_data(work))) | 2325 | cwq = get_work_cwq(work); |
| 571 | goto out; | 2326 | if (unlikely(!cwq || gcwq != cwq->gcwq)) |
| 572 | prev = &work->entry; | 2327 | goto already_gone; |
| 573 | } else { | 2328 | } else { |
| 574 | if (cwq->current_work != work) | 2329 | worker = find_worker_executing_work(gcwq, work); |
| 575 | goto out; | 2330 | if (!worker) |
| 576 | prev = &cwq->worklist; | 2331 | goto already_gone; |
| 2332 | cwq = worker->current_cwq; | ||
| 577 | } | 2333 | } |
| 578 | insert_wq_barrier(cwq, &barr, prev->next); | 2334 | |
| 579 | out: | 2335 | insert_wq_barrier(cwq, &barr, work, worker); |
| 580 | spin_unlock_irq(&cwq->lock); | 2336 | spin_unlock_irq(&gcwq->lock); |
| 581 | if (!prev) | 2337 | |
| 582 | return 0; | 2338 | lock_map_acquire(&cwq->wq->lockdep_map); |
| 2339 | lock_map_release(&cwq->wq->lockdep_map); | ||
| 583 | 2340 | ||
| 584 | wait_for_completion(&barr.done); | 2341 | wait_for_completion(&barr.done); |
| 585 | destroy_work_on_stack(&barr.work); | 2342 | destroy_work_on_stack(&barr.work); |
| 586 | return 1; | 2343 | return 1; |
| 2344 | already_gone: | ||
| 2345 | spin_unlock_irq(&gcwq->lock); | ||
| 2346 | return 0; | ||
| 587 | } | 2347 | } |
| 588 | EXPORT_SYMBOL_GPL(flush_work); | 2348 | EXPORT_SYMBOL_GPL(flush_work); |
| 589 | 2349 | ||
| @@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work); | |||
| 593 | */ | 2353 | */ |
| 594 | static int try_to_grab_pending(struct work_struct *work) | 2354 | static int try_to_grab_pending(struct work_struct *work) |
| 595 | { | 2355 | { |
| 596 | struct cpu_workqueue_struct *cwq; | 2356 | struct global_cwq *gcwq; |
| 597 | int ret = -1; | 2357 | int ret = -1; |
| 598 | 2358 | ||
| 599 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) | 2359 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) |
| 600 | return 0; | 2360 | return 0; |
| 601 | 2361 | ||
| 602 | /* | 2362 | /* |
| 603 | * The queueing is in progress, or it is already queued. Try to | 2363 | * The queueing is in progress, or it is already queued. Try to |
| 604 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | 2364 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. |
| 605 | */ | 2365 | */ |
| 606 | 2366 | gcwq = get_work_gcwq(work); | |
| 607 | cwq = get_wq_data(work); | 2367 | if (!gcwq) |
| 608 | if (!cwq) | ||
| 609 | return ret; | 2368 | return ret; |
| 610 | 2369 | ||
| 611 | spin_lock_irq(&cwq->lock); | 2370 | spin_lock_irq(&gcwq->lock); |
| 612 | if (!list_empty(&work->entry)) { | 2371 | if (!list_empty(&work->entry)) { |
| 613 | /* | 2372 | /* |
| 614 | * This work is queued, but perhaps we locked the wrong cwq. | 2373 | * This work is queued, but perhaps we locked the wrong gcwq. |
| 615 | * In that case we must see the new value after rmb(), see | 2374 | * In that case we must see the new value after rmb(), see |
| 616 | * insert_work()->wmb(). | 2375 | * insert_work()->wmb(). |
| 617 | */ | 2376 | */ |
| 618 | smp_rmb(); | 2377 | smp_rmb(); |
| 619 | if (cwq == get_wq_data(work)) { | 2378 | if (gcwq == get_work_gcwq(work)) { |
| 620 | debug_work_deactivate(work); | 2379 | debug_work_deactivate(work); |
| 621 | list_del_init(&work->entry); | 2380 | list_del_init(&work->entry); |
| 2381 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
| 2382 | get_work_color(work)); | ||
| 622 | ret = 1; | 2383 | ret = 1; |
| 623 | } | 2384 | } |
| 624 | } | 2385 | } |
| 625 | spin_unlock_irq(&cwq->lock); | 2386 | spin_unlock_irq(&gcwq->lock); |
| 626 | 2387 | ||
| 627 | return ret; | 2388 | return ret; |
| 628 | } | 2389 | } |
| 629 | 2390 | ||
| 630 | static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, | 2391 | static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) |
| 631 | struct work_struct *work) | ||
| 632 | { | 2392 | { |
| 633 | struct wq_barrier barr; | 2393 | struct wq_barrier barr; |
| 634 | int running = 0; | 2394 | struct worker *worker; |
| 635 | 2395 | ||
| 636 | spin_lock_irq(&cwq->lock); | 2396 | spin_lock_irq(&gcwq->lock); |
| 637 | if (unlikely(cwq->current_work == work)) { | 2397 | |
| 638 | insert_wq_barrier(cwq, &barr, cwq->worklist.next); | 2398 | worker = find_worker_executing_work(gcwq, work); |
| 639 | running = 1; | 2399 | if (unlikely(worker)) |
| 640 | } | 2400 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); |
| 641 | spin_unlock_irq(&cwq->lock); | ||
| 642 | 2401 | ||
| 643 | if (unlikely(running)) { | 2402 | spin_unlock_irq(&gcwq->lock); |
| 2403 | |||
| 2404 | if (unlikely(worker)) { | ||
| 644 | wait_for_completion(&barr.done); | 2405 | wait_for_completion(&barr.done); |
| 645 | destroy_work_on_stack(&barr.work); | 2406 | destroy_work_on_stack(&barr.work); |
| 646 | } | 2407 | } |
| @@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, | |||
| 648 | 2409 | ||
| 649 | static void wait_on_work(struct work_struct *work) | 2410 | static void wait_on_work(struct work_struct *work) |
| 650 | { | 2411 | { |
| 651 | struct cpu_workqueue_struct *cwq; | ||
| 652 | struct workqueue_struct *wq; | ||
| 653 | const struct cpumask *cpu_map; | ||
| 654 | int cpu; | 2412 | int cpu; |
| 655 | 2413 | ||
| 656 | might_sleep(); | 2414 | might_sleep(); |
| @@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work) | |||
| 658 | lock_map_acquire(&work->lockdep_map); | 2416 | lock_map_acquire(&work->lockdep_map); |
| 659 | lock_map_release(&work->lockdep_map); | 2417 | lock_map_release(&work->lockdep_map); |
| 660 | 2418 | ||
| 661 | cwq = get_wq_data(work); | 2419 | for_each_gcwq_cpu(cpu) |
| 662 | if (!cwq) | 2420 | wait_on_cpu_work(get_gcwq(cpu), work); |
| 663 | return; | ||
| 664 | |||
| 665 | wq = cwq->wq; | ||
| 666 | cpu_map = wq_cpu_map(wq); | ||
| 667 | |||
| 668 | for_each_cpu(cpu, cpu_map) | ||
| 669 | wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | ||
| 670 | } | 2421 | } |
| 671 | 2422 | ||
| 672 | static int __cancel_work_timer(struct work_struct *work, | 2423 | static int __cancel_work_timer(struct work_struct *work, |
| @@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work, | |||
| 681 | wait_on_work(work); | 2432 | wait_on_work(work); |
| 682 | } while (unlikely(ret < 0)); | 2433 | } while (unlikely(ret < 0)); |
| 683 | 2434 | ||
| 684 | clear_wq_data(work); | 2435 | clear_work_data(work); |
| 685 | return ret; | 2436 | return ret; |
| 686 | } | 2437 | } |
| 687 | 2438 | ||
| @@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) | |||
| 727 | } | 2478 | } |
| 728 | EXPORT_SYMBOL(cancel_delayed_work_sync); | 2479 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
| 729 | 2480 | ||
| 730 | static struct workqueue_struct *keventd_wq __read_mostly; | ||
| 731 | |||
| 732 | /** | 2481 | /** |
| 733 | * schedule_work - put work task in global workqueue | 2482 | * schedule_work - put work task in global workqueue |
| 734 | * @work: job to be done | 2483 | * @work: job to be done |
| @@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; | |||
| 742 | */ | 2491 | */ |
| 743 | int schedule_work(struct work_struct *work) | 2492 | int schedule_work(struct work_struct *work) |
| 744 | { | 2493 | { |
| 745 | return queue_work(keventd_wq, work); | 2494 | return queue_work(system_wq, work); |
| 746 | } | 2495 | } |
| 747 | EXPORT_SYMBOL(schedule_work); | 2496 | EXPORT_SYMBOL(schedule_work); |
| 748 | 2497 | ||
| @@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work); | |||
| 755 | */ | 2504 | */ |
| 756 | int schedule_work_on(int cpu, struct work_struct *work) | 2505 | int schedule_work_on(int cpu, struct work_struct *work) |
| 757 | { | 2506 | { |
| 758 | return queue_work_on(cpu, keventd_wq, work); | 2507 | return queue_work_on(cpu, system_wq, work); |
| 759 | } | 2508 | } |
| 760 | EXPORT_SYMBOL(schedule_work_on); | 2509 | EXPORT_SYMBOL(schedule_work_on); |
| 761 | 2510 | ||
| @@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on); | |||
| 770 | int schedule_delayed_work(struct delayed_work *dwork, | 2519 | int schedule_delayed_work(struct delayed_work *dwork, |
| 771 | unsigned long delay) | 2520 | unsigned long delay) |
| 772 | { | 2521 | { |
| 773 | return queue_delayed_work(keventd_wq, dwork, delay); | 2522 | return queue_delayed_work(system_wq, dwork, delay); |
| 774 | } | 2523 | } |
| 775 | EXPORT_SYMBOL(schedule_delayed_work); | 2524 | EXPORT_SYMBOL(schedule_delayed_work); |
| 776 | 2525 | ||
| @@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work); | |||
| 783 | void flush_delayed_work(struct delayed_work *dwork) | 2532 | void flush_delayed_work(struct delayed_work *dwork) |
| 784 | { | 2533 | { |
| 785 | if (del_timer_sync(&dwork->timer)) { | 2534 | if (del_timer_sync(&dwork->timer)) { |
| 786 | struct cpu_workqueue_struct *cwq; | 2535 | __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, |
| 787 | cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); | 2536 | &dwork->work); |
| 788 | __queue_work(cwq, &dwork->work); | ||
| 789 | put_cpu(); | 2537 | put_cpu(); |
| 790 | } | 2538 | } |
| 791 | flush_work(&dwork->work); | 2539 | flush_work(&dwork->work); |
| @@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
| 804 | int schedule_delayed_work_on(int cpu, | 2552 | int schedule_delayed_work_on(int cpu, |
| 805 | struct delayed_work *dwork, unsigned long delay) | 2553 | struct delayed_work *dwork, unsigned long delay) |
| 806 | { | 2554 | { |
| 807 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); | 2555 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); |
| 808 | } | 2556 | } |
| 809 | EXPORT_SYMBOL(schedule_delayed_work_on); | 2557 | EXPORT_SYMBOL(schedule_delayed_work_on); |
| 810 | 2558 | ||
| @@ -820,7 +2568,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
| 820 | int schedule_on_each_cpu(work_func_t func) | 2568 | int schedule_on_each_cpu(work_func_t func) |
| 821 | { | 2569 | { |
| 822 | int cpu; | 2570 | int cpu; |
| 823 | int orig = -1; | ||
| 824 | struct work_struct *works; | 2571 | struct work_struct *works; |
| 825 | 2572 | ||
| 826 | works = alloc_percpu(struct work_struct); | 2573 | works = alloc_percpu(struct work_struct); |
| @@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func) | |||
| 829 | 2576 | ||
| 830 | get_online_cpus(); | 2577 | get_online_cpus(); |
| 831 | 2578 | ||
| 832 | /* | ||
| 833 | * When running in keventd don't schedule a work item on | ||
| 834 | * itself. Can just call directly because the work queue is | ||
| 835 | * already bound. This also is faster. | ||
| 836 | */ | ||
| 837 | if (current_is_keventd()) | ||
| 838 | orig = raw_smp_processor_id(); | ||
| 839 | |||
| 840 | for_each_online_cpu(cpu) { | 2579 | for_each_online_cpu(cpu) { |
| 841 | struct work_struct *work = per_cpu_ptr(works, cpu); | 2580 | struct work_struct *work = per_cpu_ptr(works, cpu); |
| 842 | 2581 | ||
| 843 | INIT_WORK(work, func); | 2582 | INIT_WORK(work, func); |
| 844 | if (cpu != orig) | 2583 | schedule_work_on(cpu, work); |
| 845 | schedule_work_on(cpu, work); | ||
| 846 | } | 2584 | } |
| 847 | if (orig >= 0) | ||
| 848 | func(per_cpu_ptr(works, orig)); | ||
| 849 | 2585 | ||
| 850 | for_each_online_cpu(cpu) | 2586 | for_each_online_cpu(cpu) |
| 851 | flush_work(per_cpu_ptr(works, cpu)); | 2587 | flush_work(per_cpu_ptr(works, cpu)); |
| @@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
| 881 | */ | 2617 | */ |
| 882 | void flush_scheduled_work(void) | 2618 | void flush_scheduled_work(void) |
| 883 | { | 2619 | { |
| 884 | flush_workqueue(keventd_wq); | 2620 | flush_workqueue(system_wq); |
| 885 | } | 2621 | } |
| 886 | EXPORT_SYMBOL(flush_scheduled_work); | 2622 | EXPORT_SYMBOL(flush_scheduled_work); |
| 887 | 2623 | ||
| @@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); | |||
| 913 | 2649 | ||
| 914 | int keventd_up(void) | 2650 | int keventd_up(void) |
| 915 | { | 2651 | { |
| 916 | return keventd_wq != NULL; | 2652 | return system_wq != NULL; |
| 917 | } | 2653 | } |
| 918 | 2654 | ||
| 919 | int current_is_keventd(void) | 2655 | static int alloc_cwqs(struct workqueue_struct *wq) |
| 920 | { | 2656 | { |
| 921 | struct cpu_workqueue_struct *cwq; | 2657 | /* |
| 922 | int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ | 2658 | * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. |
| 923 | int ret = 0; | 2659 | * Make sure that the alignment isn't lower than that of |
| 924 | 2660 | * unsigned long long. | |
| 925 | BUG_ON(!keventd_wq); | 2661 | */ |
| 2662 | const size_t size = sizeof(struct cpu_workqueue_struct); | ||
| 2663 | const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, | ||
| 2664 | __alignof__(unsigned long long)); | ||
| 2665 | #ifdef CONFIG_SMP | ||
| 2666 | bool percpu = !(wq->flags & WQ_UNBOUND); | ||
| 2667 | #else | ||
| 2668 | bool percpu = false; | ||
| 2669 | #endif | ||
| 926 | 2670 | ||
| 927 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); | 2671 | if (percpu) |
| 928 | if (current == cwq->thread) | 2672 | wq->cpu_wq.pcpu = __alloc_percpu(size, align); |
| 929 | ret = 1; | 2673 | else { |
| 2674 | void *ptr; | ||
| 930 | 2675 | ||
| 931 | return ret; | 2676 | /* |
| 2677 | * Allocate enough room to align cwq and put an extra | ||
| 2678 | * pointer at the end pointing back to the originally | ||
| 2679 | * allocated pointer which will be used for free. | ||
| 2680 | */ | ||
| 2681 | ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); | ||
| 2682 | if (ptr) { | ||
| 2683 | wq->cpu_wq.single = PTR_ALIGN(ptr, align); | ||
| 2684 | *(void **)(wq->cpu_wq.single + 1) = ptr; | ||
| 2685 | } | ||
| 2686 | } | ||
| 932 | 2687 | ||
| 2688 | /* just in case, make sure it's actually aligned */ | ||
| 2689 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | ||
| 2690 | return wq->cpu_wq.v ? 0 : -ENOMEM; | ||
| 933 | } | 2691 | } |
| 934 | 2692 | ||
| 935 | static struct cpu_workqueue_struct * | 2693 | static void free_cwqs(struct workqueue_struct *wq) |
| 936 | init_cpu_workqueue(struct workqueue_struct *wq, int cpu) | ||
| 937 | { | 2694 | { |
| 938 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 2695 | #ifdef CONFIG_SMP |
| 939 | 2696 | bool percpu = !(wq->flags & WQ_UNBOUND); | |
| 940 | cwq->wq = wq; | 2697 | #else |
| 941 | spin_lock_init(&cwq->lock); | 2698 | bool percpu = false; |
| 942 | INIT_LIST_HEAD(&cwq->worklist); | 2699 | #endif |
| 943 | init_waitqueue_head(&cwq->more_work); | ||
| 944 | 2700 | ||
| 945 | return cwq; | 2701 | if (percpu) |
| 2702 | free_percpu(wq->cpu_wq.pcpu); | ||
| 2703 | else if (wq->cpu_wq.single) { | ||
| 2704 | /* the pointer to free is stored right after the cwq */ | ||
| 2705 | kfree(*(void **)(wq->cpu_wq.single + 1)); | ||
| 2706 | } | ||
| 946 | } | 2707 | } |
| 947 | 2708 | ||
| 948 | static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 2709 | static int wq_clamp_max_active(int max_active, unsigned int flags, |
| 2710 | const char *name) | ||
| 949 | { | 2711 | { |
| 950 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 2712 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; |
| 951 | struct workqueue_struct *wq = cwq->wq; | ||
| 952 | const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; | ||
| 953 | struct task_struct *p; | ||
| 954 | 2713 | ||
| 955 | p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); | 2714 | if (max_active < 1 || max_active > lim) |
| 956 | /* | 2715 | printk(KERN_WARNING "workqueue: max_active %d requested for %s " |
| 957 | * Nobody can add the work_struct to this cwq, | 2716 | "is out of range, clamping between %d and %d\n", |
| 958 | * if (caller is __create_workqueue) | 2717 | max_active, name, 1, lim); |
| 959 | * nobody should see this wq | ||
| 960 | * else // caller is CPU_UP_PREPARE | ||
| 961 | * cpu is not on cpu_online_map | ||
| 962 | * so we can abort safely. | ||
| 963 | */ | ||
| 964 | if (IS_ERR(p)) | ||
| 965 | return PTR_ERR(p); | ||
| 966 | if (cwq->wq->rt) | ||
| 967 | sched_setscheduler_nocheck(p, SCHED_FIFO, &param); | ||
| 968 | cwq->thread = p; | ||
| 969 | 2718 | ||
| 970 | trace_workqueue_creation(cwq->thread, cpu); | 2719 | return clamp_val(max_active, 1, lim); |
| 971 | |||
| 972 | return 0; | ||
| 973 | } | 2720 | } |
| 974 | 2721 | ||
| 975 | static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 2722 | struct workqueue_struct *__alloc_workqueue_key(const char *name, |
| 2723 | unsigned int flags, | ||
| 2724 | int max_active, | ||
| 2725 | struct lock_class_key *key, | ||
| 2726 | const char *lock_name) | ||
| 976 | { | 2727 | { |
| 977 | struct task_struct *p = cwq->thread; | 2728 | struct workqueue_struct *wq; |
| 2729 | unsigned int cpu; | ||
| 978 | 2730 | ||
| 979 | if (p != NULL) { | 2731 | /* |
| 980 | if (cpu >= 0) | 2732 | * Unbound workqueues aren't concurrency managed and should be |
| 981 | kthread_bind(p, cpu); | 2733 | * dispatched to workers immediately. |
| 982 | wake_up_process(p); | 2734 | */ |
| 983 | } | 2735 | if (flags & WQ_UNBOUND) |
| 984 | } | 2736 | flags |= WQ_HIGHPRI; |
| 985 | 2737 | ||
| 986 | struct workqueue_struct *__create_workqueue_key(const char *name, | 2738 | max_active = max_active ?: WQ_DFL_ACTIVE; |
| 987 | int singlethread, | 2739 | max_active = wq_clamp_max_active(max_active, flags, name); |
| 988 | int freezeable, | ||
| 989 | int rt, | ||
| 990 | struct lock_class_key *key, | ||
| 991 | const char *lock_name) | ||
| 992 | { | ||
| 993 | struct workqueue_struct *wq; | ||
| 994 | struct cpu_workqueue_struct *cwq; | ||
| 995 | int err = 0, cpu; | ||
| 996 | 2740 | ||
| 997 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); | 2741 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); |
| 998 | if (!wq) | 2742 | if (!wq) |
| 999 | return NULL; | 2743 | goto err; |
| 1000 | 2744 | ||
| 1001 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | 2745 | wq->flags = flags; |
| 1002 | if (!wq->cpu_wq) { | 2746 | wq->saved_max_active = max_active; |
| 1003 | kfree(wq); | 2747 | mutex_init(&wq->flush_mutex); |
| 1004 | return NULL; | 2748 | atomic_set(&wq->nr_cwqs_to_flush, 0); |
| 1005 | } | 2749 | INIT_LIST_HEAD(&wq->flusher_queue); |
| 2750 | INIT_LIST_HEAD(&wq->flusher_overflow); | ||
| 1006 | 2751 | ||
| 1007 | wq->name = name; | 2752 | wq->name = name; |
| 1008 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 2753 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
| 1009 | wq->singlethread = singlethread; | ||
| 1010 | wq->freezeable = freezeable; | ||
| 1011 | wq->rt = rt; | ||
| 1012 | INIT_LIST_HEAD(&wq->list); | 2754 | INIT_LIST_HEAD(&wq->list); |
| 1013 | 2755 | ||
| 1014 | if (singlethread) { | 2756 | if (alloc_cwqs(wq) < 0) |
| 1015 | cwq = init_cpu_workqueue(wq, singlethread_cpu); | 2757 | goto err; |
| 1016 | err = create_workqueue_thread(cwq, singlethread_cpu); | 2758 | |
| 1017 | start_workqueue_thread(cwq, -1); | 2759 | for_each_cwq_cpu(cpu, wq) { |
| 1018 | } else { | 2760 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
| 1019 | cpu_maps_update_begin(); | 2761 | struct global_cwq *gcwq = get_gcwq(cpu); |
| 1020 | /* | 2762 | |
| 1021 | * We must place this wq on list even if the code below fails. | 2763 | BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); |
| 1022 | * cpu_down(cpu) can remove cpu from cpu_populated_map before | 2764 | cwq->gcwq = gcwq; |
| 1023 | * destroy_workqueue() takes the lock, in that case we leak | 2765 | cwq->wq = wq; |
| 1024 | * cwq[cpu]->thread. | 2766 | cwq->flush_color = -1; |
| 1025 | */ | 2767 | cwq->max_active = max_active; |
| 1026 | spin_lock(&workqueue_lock); | 2768 | INIT_LIST_HEAD(&cwq->delayed_works); |
| 1027 | list_add(&wq->list, &workqueues); | ||
| 1028 | spin_unlock(&workqueue_lock); | ||
| 1029 | /* | ||
| 1030 | * We must initialize cwqs for each possible cpu even if we | ||
| 1031 | * are going to call destroy_workqueue() finally. Otherwise | ||
| 1032 | * cpu_up() can hit the uninitialized cwq once we drop the | ||
| 1033 | * lock. | ||
| 1034 | */ | ||
| 1035 | for_each_possible_cpu(cpu) { | ||
| 1036 | cwq = init_cpu_workqueue(wq, cpu); | ||
| 1037 | if (err || !cpu_online(cpu)) | ||
| 1038 | continue; | ||
| 1039 | err = create_workqueue_thread(cwq, cpu); | ||
| 1040 | start_workqueue_thread(cwq, cpu); | ||
| 1041 | } | ||
| 1042 | cpu_maps_update_done(); | ||
| 1043 | } | 2769 | } |
| 1044 | 2770 | ||
| 1045 | if (err) { | 2771 | if (flags & WQ_RESCUER) { |
| 1046 | destroy_workqueue(wq); | 2772 | struct worker *rescuer; |
| 1047 | wq = NULL; | 2773 | |
| 2774 | if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) | ||
| 2775 | goto err; | ||
| 2776 | |||
| 2777 | wq->rescuer = rescuer = alloc_worker(); | ||
| 2778 | if (!rescuer) | ||
| 2779 | goto err; | ||
| 2780 | |||
| 2781 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); | ||
| 2782 | if (IS_ERR(rescuer->task)) | ||
| 2783 | goto err; | ||
| 2784 | |||
| 2785 | wq->rescuer = rescuer; | ||
| 2786 | rescuer->task->flags |= PF_THREAD_BOUND; | ||
| 2787 | wake_up_process(rescuer->task); | ||
| 1048 | } | 2788 | } |
| 1049 | return wq; | ||
| 1050 | } | ||
| 1051 | EXPORT_SYMBOL_GPL(__create_workqueue_key); | ||
| 1052 | 2789 | ||
| 1053 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | ||
| 1054 | { | ||
| 1055 | /* | 2790 | /* |
| 1056 | * Our caller is either destroy_workqueue() or CPU_POST_DEAD, | 2791 | * workqueue_lock protects global freeze state and workqueues |
| 1057 | * cpu_add_remove_lock protects cwq->thread. | 2792 | * list. Grab it, set max_active accordingly and add the new |
| 2793 | * workqueue to workqueues list. | ||
| 1058 | */ | 2794 | */ |
| 1059 | if (cwq->thread == NULL) | 2795 | spin_lock(&workqueue_lock); |
| 1060 | return; | ||
| 1061 | 2796 | ||
| 1062 | lock_map_acquire(&cwq->wq->lockdep_map); | 2797 | if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) |
| 1063 | lock_map_release(&cwq->wq->lockdep_map); | 2798 | for_each_cwq_cpu(cpu, wq) |
| 2799 | get_cwq(cpu, wq)->max_active = 0; | ||
| 1064 | 2800 | ||
| 1065 | flush_cpu_workqueue(cwq); | 2801 | list_add(&wq->list, &workqueues); |
| 1066 | /* | 2802 | |
| 1067 | * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, | 2803 | spin_unlock(&workqueue_lock); |
| 1068 | * a concurrent flush_workqueue() can insert a barrier after us. | 2804 | |
| 1069 | * However, in that case run_workqueue() won't return and check | 2805 | return wq; |
| 1070 | * kthread_should_stop() until it flushes all work_struct's. | 2806 | err: |
| 1071 | * When ->worklist becomes empty it is safe to exit because no | 2807 | if (wq) { |
| 1072 | * more work_structs can be queued on this cwq: flush_workqueue | 2808 | free_cwqs(wq); |
| 1073 | * checks list_empty(), and a "normal" queue_work() can't use | 2809 | free_mayday_mask(wq->mayday_mask); |
| 1074 | * a dead CPU. | 2810 | kfree(wq->rescuer); |
| 1075 | */ | 2811 | kfree(wq); |
| 1076 | trace_workqueue_destruction(cwq->thread); | 2812 | } |
| 1077 | kthread_stop(cwq->thread); | 2813 | return NULL; |
| 1078 | cwq->thread = NULL; | ||
| 1079 | } | 2814 | } |
| 2815 | EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | ||
| 1080 | 2816 | ||
| 1081 | /** | 2817 | /** |
| 1082 | * destroy_workqueue - safely terminate a workqueue | 2818 | * destroy_workqueue - safely terminate a workqueue |
| @@ -1086,72 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) | |||
| 1086 | */ | 2822 | */ |
| 1087 | void destroy_workqueue(struct workqueue_struct *wq) | 2823 | void destroy_workqueue(struct workqueue_struct *wq) |
| 1088 | { | 2824 | { |
| 1089 | const struct cpumask *cpu_map = wq_cpu_map(wq); | 2825 | unsigned int cpu; |
| 1090 | int cpu; | ||
| 1091 | 2826 | ||
| 1092 | cpu_maps_update_begin(); | 2827 | flush_workqueue(wq); |
| 2828 | |||
| 2829 | /* | ||
| 2830 | * wq list is used to freeze wq, remove from list after | ||
| 2831 | * flushing is complete in case freeze races us. | ||
| 2832 | */ | ||
| 1093 | spin_lock(&workqueue_lock); | 2833 | spin_lock(&workqueue_lock); |
| 1094 | list_del(&wq->list); | 2834 | list_del(&wq->list); |
| 1095 | spin_unlock(&workqueue_lock); | 2835 | spin_unlock(&workqueue_lock); |
| 1096 | 2836 | ||
| 1097 | for_each_cpu(cpu, cpu_map) | 2837 | /* sanity check */ |
| 1098 | cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); | 2838 | for_each_cwq_cpu(cpu, wq) { |
| 1099 | cpu_maps_update_done(); | 2839 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); |
| 2840 | int i; | ||
| 2841 | |||
| 2842 | for (i = 0; i < WORK_NR_COLORS; i++) | ||
| 2843 | BUG_ON(cwq->nr_in_flight[i]); | ||
| 2844 | BUG_ON(cwq->nr_active); | ||
| 2845 | BUG_ON(!list_empty(&cwq->delayed_works)); | ||
| 2846 | } | ||
| 2847 | |||
| 2848 | if (wq->flags & WQ_RESCUER) { | ||
| 2849 | kthread_stop(wq->rescuer->task); | ||
| 2850 | free_mayday_mask(wq->mayday_mask); | ||
| 2851 | } | ||
| 1100 | 2852 | ||
| 1101 | free_percpu(wq->cpu_wq); | 2853 | free_cwqs(wq); |
| 1102 | kfree(wq); | 2854 | kfree(wq); |
| 1103 | } | 2855 | } |
| 1104 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 2856 | EXPORT_SYMBOL_GPL(destroy_workqueue); |
| 1105 | 2857 | ||
| 2858 | /** | ||
| 2859 | * workqueue_set_max_active - adjust max_active of a workqueue | ||
| 2860 | * @wq: target workqueue | ||
| 2861 | * @max_active: new max_active value. | ||
| 2862 | * | ||
| 2863 | * Set max_active of @wq to @max_active. | ||
| 2864 | * | ||
| 2865 | * CONTEXT: | ||
| 2866 | * Don't call from IRQ context. | ||
| 2867 | */ | ||
| 2868 | void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | ||
| 2869 | { | ||
| 2870 | unsigned int cpu; | ||
| 2871 | |||
| 2872 | max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); | ||
| 2873 | |||
| 2874 | spin_lock(&workqueue_lock); | ||
| 2875 | |||
| 2876 | wq->saved_max_active = max_active; | ||
| 2877 | |||
| 2878 | for_each_cwq_cpu(cpu, wq) { | ||
| 2879 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 2880 | |||
| 2881 | spin_lock_irq(&gcwq->lock); | ||
| 2882 | |||
| 2883 | if (!(wq->flags & WQ_FREEZEABLE) || | ||
| 2884 | !(gcwq->flags & GCWQ_FREEZING)) | ||
| 2885 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | ||
| 2886 | |||
| 2887 | spin_unlock_irq(&gcwq->lock); | ||
| 2888 | } | ||
| 2889 | |||
| 2890 | spin_unlock(&workqueue_lock); | ||
| 2891 | } | ||
| 2892 | EXPORT_SYMBOL_GPL(workqueue_set_max_active); | ||
| 2893 | |||
| 2894 | /** | ||
| 2895 | * workqueue_congested - test whether a workqueue is congested | ||
| 2896 | * @cpu: CPU in question | ||
| 2897 | * @wq: target workqueue | ||
| 2898 | * | ||
| 2899 | * Test whether @wq's cpu workqueue for @cpu is congested. There is | ||
| 2900 | * no synchronization around this function and the test result is | ||
| 2901 | * unreliable and only useful as advisory hints or for debugging. | ||
| 2902 | * | ||
| 2903 | * RETURNS: | ||
| 2904 | * %true if congested, %false otherwise. | ||
| 2905 | */ | ||
| 2906 | bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) | ||
| 2907 | { | ||
| 2908 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 2909 | |||
| 2910 | return !list_empty(&cwq->delayed_works); | ||
| 2911 | } | ||
| 2912 | EXPORT_SYMBOL_GPL(workqueue_congested); | ||
| 2913 | |||
| 2914 | /** | ||
| 2915 | * work_cpu - return the last known associated cpu for @work | ||
| 2916 | * @work: the work of interest | ||
| 2917 | * | ||
| 2918 | * RETURNS: | ||
| 2919 | * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. | ||
| 2920 | */ | ||
| 2921 | unsigned int work_cpu(struct work_struct *work) | ||
| 2922 | { | ||
| 2923 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 2924 | |||
| 2925 | return gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
| 2926 | } | ||
| 2927 | EXPORT_SYMBOL_GPL(work_cpu); | ||
| 2928 | |||
| 2929 | /** | ||
| 2930 | * work_busy - test whether a work is currently pending or running | ||
| 2931 | * @work: the work to be tested | ||
| 2932 | * | ||
| 2933 | * Test whether @work is currently pending or running. There is no | ||
| 2934 | * synchronization around this function and the test result is | ||
| 2935 | * unreliable and only useful as advisory hints or for debugging. | ||
| 2936 | * Especially for reentrant wqs, the pending state might hide the | ||
| 2937 | * running state. | ||
| 2938 | * | ||
| 2939 | * RETURNS: | ||
| 2940 | * OR'd bitmask of WORK_BUSY_* bits. | ||
| 2941 | */ | ||
| 2942 | unsigned int work_busy(struct work_struct *work) | ||
| 2943 | { | ||
| 2944 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 2945 | unsigned long flags; | ||
| 2946 | unsigned int ret = 0; | ||
| 2947 | |||
| 2948 | if (!gcwq) | ||
| 2949 | return false; | ||
| 2950 | |||
| 2951 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 2952 | |||
| 2953 | if (work_pending(work)) | ||
| 2954 | ret |= WORK_BUSY_PENDING; | ||
| 2955 | if (find_worker_executing_work(gcwq, work)) | ||
| 2956 | ret |= WORK_BUSY_RUNNING; | ||
| 2957 | |||
| 2958 | spin_unlock_irqrestore(&gcwq->lock, flags); | ||
| 2959 | |||
| 2960 | return ret; | ||
| 2961 | } | ||
| 2962 | EXPORT_SYMBOL_GPL(work_busy); | ||
| 2963 | |||
| 2964 | /* | ||
| 2965 | * CPU hotplug. | ||
| 2966 | * | ||
| 2967 | * There are two challenges in supporting CPU hotplug. Firstly, there | ||
| 2968 | * are a lot of assumptions on strong associations among work, cwq and | ||
| 2969 | * gcwq which make migrating pending and scheduled works very | ||
| 2970 | * difficult to implement without impacting hot paths. Secondly, | ||
| 2971 | * gcwqs serve mix of short, long and very long running works making | ||
| 2972 | * blocked draining impractical. | ||
| 2973 | * | ||
| 2974 | * This is solved by allowing a gcwq to be detached from CPU, running | ||
| 2975 | * it with unbound (rogue) workers and allowing it to be reattached | ||
| 2976 | * later if the cpu comes back online. A separate thread is created | ||
| 2977 | * to govern a gcwq in such state and is called the trustee of the | ||
| 2978 | * gcwq. | ||
| 2979 | * | ||
| 2980 | * Trustee states and their descriptions. | ||
| 2981 | * | ||
| 2982 | * START Command state used on startup. On CPU_DOWN_PREPARE, a | ||
| 2983 | * new trustee is started with this state. | ||
| 2984 | * | ||
| 2985 | * IN_CHARGE Once started, trustee will enter this state after | ||
| 2986 | * assuming the manager role and making all existing | ||
| 2987 | * workers rogue. DOWN_PREPARE waits for trustee to | ||
| 2988 | * enter this state. After reaching IN_CHARGE, trustee | ||
| 2989 | * tries to execute the pending worklist until it's empty | ||
| 2990 | * and the state is set to BUTCHER, or the state is set | ||
| 2991 | * to RELEASE. | ||
| 2992 | * | ||
| 2993 | * BUTCHER Command state which is set by the cpu callback after | ||
| 2994 | * the cpu has gone down. Once this state is set trustee | ||
| 2995 | * knows that there will be no new works on the worklist | ||
| 2996 | * and once the worklist is empty it can proceed to | ||
| 2997 | * killing idle workers. | ||
| 2998 | * | ||
| 2999 | * RELEASE Command state which is set by the cpu callback if the | ||
| 3000 | * cpu down has been canceled or it has come online | ||
| 3001 | * again. After recognizing this state, trustee stops | ||
| 3002 | * trying to drain or butcher and clears ROGUE, rebinds | ||
| 3003 | * all remaining workers back to the cpu and releases | ||
| 3004 | * manager role. | ||
| 3005 | * | ||
| 3006 | * DONE Trustee will enter this state after BUTCHER or RELEASE | ||
| 3007 | * is complete. | ||
| 3008 | * | ||
| 3009 | * trustee CPU draining | ||
| 3010 | * took over down complete | ||
| 3011 | * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE | ||
| 3012 | * | | ^ | ||
| 3013 | * | CPU is back online v return workers | | ||
| 3014 | * ----------------> RELEASE -------------- | ||
| 3015 | */ | ||
| 3016 | |||
| 3017 | /** | ||
| 3018 | * trustee_wait_event_timeout - timed event wait for trustee | ||
| 3019 | * @cond: condition to wait for | ||
| 3020 | * @timeout: timeout in jiffies | ||
| 3021 | * | ||
| 3022 | * wait_event_timeout() for trustee to use. Handles locking and | ||
| 3023 | * checks for RELEASE request. | ||
| 3024 | * | ||
| 3025 | * CONTEXT: | ||
| 3026 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 3027 | * multiple times. To be used by trustee. | ||
| 3028 | * | ||
| 3029 | * RETURNS: | ||
| 3030 | * Positive indicating left time if @cond is satisfied, 0 if timed | ||
| 3031 | * out, -1 if canceled. | ||
| 3032 | */ | ||
| 3033 | #define trustee_wait_event_timeout(cond, timeout) ({ \ | ||
| 3034 | long __ret = (timeout); \ | ||
| 3035 | while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ | ||
| 3036 | __ret) { \ | ||
| 3037 | spin_unlock_irq(&gcwq->lock); \ | ||
| 3038 | __wait_event_timeout(gcwq->trustee_wait, (cond) || \ | ||
| 3039 | (gcwq->trustee_state == TRUSTEE_RELEASE), \ | ||
| 3040 | __ret); \ | ||
| 3041 | spin_lock_irq(&gcwq->lock); \ | ||
| 3042 | } \ | ||
| 3043 | gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ | ||
| 3044 | }) | ||
| 3045 | |||
| 3046 | /** | ||
| 3047 | * trustee_wait_event - event wait for trustee | ||
| 3048 | * @cond: condition to wait for | ||
| 3049 | * | ||
| 3050 | * wait_event() for trustee to use. Automatically handles locking and | ||
| 3051 | * checks for RELEASE request. | ||
| 3052 | * | ||
| 3053 | * CONTEXT: | ||
| 3054 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 3055 | * multiple times. To be used by trustee. | ||
| 3056 | * | ||
| 3057 | * RETURNS: | ||
| 3058 | * 0 if @cond is satisfied, -1 if canceled. | ||
| 3059 | */ | ||
| 3060 | #define trustee_wait_event(cond) ({ \ | ||
| 3061 | long __ret1; \ | ||
| 3062 | __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ | ||
| 3063 | __ret1 < 0 ? -1 : 0; \ | ||
| 3064 | }) | ||
| 3065 | |||
| 3066 | static int __cpuinit trustee_thread(void *__gcwq) | ||
| 3067 | { | ||
| 3068 | struct global_cwq *gcwq = __gcwq; | ||
| 3069 | struct worker *worker; | ||
| 3070 | struct work_struct *work; | ||
| 3071 | struct hlist_node *pos; | ||
| 3072 | long rc; | ||
| 3073 | int i; | ||
| 3074 | |||
| 3075 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
| 3076 | |||
| 3077 | spin_lock_irq(&gcwq->lock); | ||
| 3078 | /* | ||
| 3079 | * Claim the manager position and make all workers rogue. | ||
| 3080 | * Trustee must be bound to the target cpu and can't be | ||
| 3081 | * cancelled. | ||
| 3082 | */ | ||
| 3083 | BUG_ON(gcwq->cpu != smp_processor_id()); | ||
| 3084 | rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); | ||
| 3085 | BUG_ON(rc < 0); | ||
| 3086 | |||
| 3087 | gcwq->flags |= GCWQ_MANAGING_WORKERS; | ||
| 3088 | |||
| 3089 | list_for_each_entry(worker, &gcwq->idle_list, entry) | ||
| 3090 | worker->flags |= WORKER_ROGUE; | ||
| 3091 | |||
| 3092 | for_each_busy_worker(worker, i, pos, gcwq) | ||
| 3093 | worker->flags |= WORKER_ROGUE; | ||
| 3094 | |||
| 3095 | /* | ||
| 3096 | * Call schedule() so that we cross rq->lock and thus can | ||
| 3097 | * guarantee sched callbacks see the rogue flag. This is | ||
| 3098 | * necessary as scheduler callbacks may be invoked from other | ||
| 3099 | * cpus. | ||
| 3100 | */ | ||
| 3101 | spin_unlock_irq(&gcwq->lock); | ||
| 3102 | schedule(); | ||
| 3103 | spin_lock_irq(&gcwq->lock); | ||
| 3104 | |||
| 3105 | /* | ||
| 3106 | * Sched callbacks are disabled now. Zap nr_running. After | ||
| 3107 | * this, nr_running stays zero and need_more_worker() and | ||
| 3108 | * keep_working() are always true as long as the worklist is | ||
| 3109 | * not empty. | ||
| 3110 | */ | ||
| 3111 | atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); | ||
| 3112 | |||
| 3113 | spin_unlock_irq(&gcwq->lock); | ||
| 3114 | del_timer_sync(&gcwq->idle_timer); | ||
| 3115 | spin_lock_irq(&gcwq->lock); | ||
| 3116 | |||
| 3117 | /* | ||
| 3118 | * We're now in charge. Notify and proceed to drain. We need | ||
| 3119 | * to keep the gcwq running during the whole CPU down | ||
| 3120 | * procedure as other cpu hotunplug callbacks may need to | ||
| 3121 | * flush currently running tasks. | ||
| 3122 | */ | ||
| 3123 | gcwq->trustee_state = TRUSTEE_IN_CHARGE; | ||
| 3124 | wake_up_all(&gcwq->trustee_wait); | ||
| 3125 | |||
| 3126 | /* | ||
| 3127 | * The original cpu is in the process of dying and may go away | ||
| 3128 | * anytime now. When that happens, we and all workers would | ||
| 3129 | * be migrated to other cpus. Try draining any left work. We | ||
| 3130 | * want to get it over with ASAP - spam rescuers, wake up as | ||
| 3131 | * many idlers as necessary and create new ones till the | ||
| 3132 | * worklist is empty. Note that if the gcwq is frozen, there | ||
| 3133 | * may be frozen works in freezeable cwqs. Don't declare | ||
| 3134 | * completion while frozen. | ||
| 3135 | */ | ||
| 3136 | while (gcwq->nr_workers != gcwq->nr_idle || | ||
| 3137 | gcwq->flags & GCWQ_FREEZING || | ||
| 3138 | gcwq->trustee_state == TRUSTEE_IN_CHARGE) { | ||
| 3139 | int nr_works = 0; | ||
| 3140 | |||
| 3141 | list_for_each_entry(work, &gcwq->worklist, entry) { | ||
| 3142 | send_mayday(work); | ||
| 3143 | nr_works++; | ||
| 3144 | } | ||
| 3145 | |||
| 3146 | list_for_each_entry(worker, &gcwq->idle_list, entry) { | ||
| 3147 | if (!nr_works--) | ||
| 3148 | break; | ||
| 3149 | wake_up_process(worker->task); | ||
| 3150 | } | ||
| 3151 | |||
| 3152 | if (need_to_create_worker(gcwq)) { | ||
| 3153 | spin_unlock_irq(&gcwq->lock); | ||
| 3154 | worker = create_worker(gcwq, false); | ||
| 3155 | spin_lock_irq(&gcwq->lock); | ||
| 3156 | if (worker) { | ||
| 3157 | worker->flags |= WORKER_ROGUE; | ||
| 3158 | start_worker(worker); | ||
| 3159 | } | ||
| 3160 | } | ||
| 3161 | |||
| 3162 | /* give a breather */ | ||
| 3163 | if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) | ||
| 3164 | break; | ||
| 3165 | } | ||
| 3166 | |||
| 3167 | /* | ||
| 3168 | * Either all works have been scheduled and cpu is down, or | ||
| 3169 | * cpu down has already been canceled. Wait for and butcher | ||
| 3170 | * all workers till we're canceled. | ||
| 3171 | */ | ||
| 3172 | do { | ||
| 3173 | rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); | ||
| 3174 | while (!list_empty(&gcwq->idle_list)) | ||
| 3175 | destroy_worker(list_first_entry(&gcwq->idle_list, | ||
| 3176 | struct worker, entry)); | ||
| 3177 | } while (gcwq->nr_workers && rc >= 0); | ||
| 3178 | |||
| 3179 | /* | ||
| 3180 | * At this point, either draining has completed and no worker | ||
| 3181 | * is left, or cpu down has been canceled or the cpu is being | ||
| 3182 | * brought back up. There shouldn't be any idle one left. | ||
| 3183 | * Tell the remaining busy ones to rebind once it finishes the | ||
| 3184 | * currently scheduled works by scheduling the rebind_work. | ||
| 3185 | */ | ||
| 3186 | WARN_ON(!list_empty(&gcwq->idle_list)); | ||
| 3187 | |||
| 3188 | for_each_busy_worker(worker, i, pos, gcwq) { | ||
| 3189 | struct work_struct *rebind_work = &worker->rebind_work; | ||
| 3190 | |||
| 3191 | /* | ||
| 3192 | * Rebind_work may race with future cpu hotplug | ||
| 3193 | * operations. Use a separate flag to mark that | ||
| 3194 | * rebinding is scheduled. | ||
| 3195 | */ | ||
| 3196 | worker->flags |= WORKER_REBIND; | ||
| 3197 | worker->flags &= ~WORKER_ROGUE; | ||
| 3198 | |||
| 3199 | /* queue rebind_work, wq doesn't matter, use the default one */ | ||
| 3200 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | ||
| 3201 | work_data_bits(rebind_work))) | ||
| 3202 | continue; | ||
| 3203 | |||
| 3204 | debug_work_activate(rebind_work); | ||
| 3205 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
| 3206 | worker->scheduled.next, | ||
| 3207 | work_color_to_flags(WORK_NO_COLOR)); | ||
| 3208 | } | ||
| 3209 | |||
| 3210 | /* relinquish manager role */ | ||
| 3211 | gcwq->flags &= ~GCWQ_MANAGING_WORKERS; | ||
| 3212 | |||
| 3213 | /* notify completion */ | ||
| 3214 | gcwq->trustee = NULL; | ||
| 3215 | gcwq->trustee_state = TRUSTEE_DONE; | ||
| 3216 | wake_up_all(&gcwq->trustee_wait); | ||
| 3217 | spin_unlock_irq(&gcwq->lock); | ||
| 3218 | return 0; | ||
| 3219 | } | ||
| 3220 | |||
| 3221 | /** | ||
| 3222 | * wait_trustee_state - wait for trustee to enter the specified state | ||
| 3223 | * @gcwq: gcwq the trustee of interest belongs to | ||
| 3224 | * @state: target state to wait for | ||
| 3225 | * | ||
| 3226 | * Wait for the trustee to reach @state. DONE is always matched. | ||
| 3227 | * | ||
| 3228 | * CONTEXT: | ||
| 3229 | * spin_lock_irq(gcwq->lock) which may be released and regrabbed | ||
| 3230 | * multiple times. To be used by cpu_callback. | ||
| 3231 | */ | ||
| 3232 | static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) | ||
| 3233 | { | ||
| 3234 | if (!(gcwq->trustee_state == state || | ||
| 3235 | gcwq->trustee_state == TRUSTEE_DONE)) { | ||
| 3236 | spin_unlock_irq(&gcwq->lock); | ||
| 3237 | __wait_event(gcwq->trustee_wait, | ||
| 3238 | gcwq->trustee_state == state || | ||
| 3239 | gcwq->trustee_state == TRUSTEE_DONE); | ||
| 3240 | spin_lock_irq(&gcwq->lock); | ||
| 3241 | } | ||
| 3242 | } | ||
| 3243 | |||
| 1106 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | 3244 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
| 1107 | unsigned long action, | 3245 | unsigned long action, |
| 1108 | void *hcpu) | 3246 | void *hcpu) |
| 1109 | { | 3247 | { |
| 1110 | unsigned int cpu = (unsigned long)hcpu; | 3248 | unsigned int cpu = (unsigned long)hcpu; |
| 1111 | struct cpu_workqueue_struct *cwq; | 3249 | struct global_cwq *gcwq = get_gcwq(cpu); |
| 1112 | struct workqueue_struct *wq; | 3250 | struct task_struct *new_trustee = NULL; |
| 1113 | int err = 0; | 3251 | struct worker *uninitialized_var(new_worker); |
| 3252 | unsigned long flags; | ||
| 1114 | 3253 | ||
| 1115 | action &= ~CPU_TASKS_FROZEN; | 3254 | action &= ~CPU_TASKS_FROZEN; |
| 1116 | 3255 | ||
| 1117 | switch (action) { | 3256 | switch (action) { |
| 3257 | case CPU_DOWN_PREPARE: | ||
| 3258 | new_trustee = kthread_create(trustee_thread, gcwq, | ||
| 3259 | "workqueue_trustee/%d", cpu); | ||
| 3260 | if (IS_ERR(new_trustee)) | ||
| 3261 | return notifier_from_errno(PTR_ERR(new_trustee)); | ||
| 3262 | kthread_bind(new_trustee, cpu); | ||
| 3263 | /* fall through */ | ||
| 1118 | case CPU_UP_PREPARE: | 3264 | case CPU_UP_PREPARE: |
| 1119 | cpumask_set_cpu(cpu, cpu_populated_map); | 3265 | BUG_ON(gcwq->first_idle); |
| 1120 | } | 3266 | new_worker = create_worker(gcwq, false); |
| 1121 | undo: | 3267 | if (!new_worker) { |
| 1122 | list_for_each_entry(wq, &workqueues, list) { | 3268 | if (new_trustee) |
| 1123 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 3269 | kthread_stop(new_trustee); |
| 1124 | 3270 | return NOTIFY_BAD; | |
| 1125 | switch (action) { | ||
| 1126 | case CPU_UP_PREPARE: | ||
| 1127 | err = create_workqueue_thread(cwq, cpu); | ||
| 1128 | if (!err) | ||
| 1129 | break; | ||
| 1130 | printk(KERN_ERR "workqueue [%s] for %i failed\n", | ||
| 1131 | wq->name, cpu); | ||
| 1132 | action = CPU_UP_CANCELED; | ||
| 1133 | err = -ENOMEM; | ||
| 1134 | goto undo; | ||
| 1135 | |||
| 1136 | case CPU_ONLINE: | ||
| 1137 | start_workqueue_thread(cwq, cpu); | ||
| 1138 | break; | ||
| 1139 | |||
| 1140 | case CPU_UP_CANCELED: | ||
| 1141 | start_workqueue_thread(cwq, -1); | ||
| 1142 | case CPU_POST_DEAD: | ||
| 1143 | cleanup_workqueue_thread(cwq); | ||
| 1144 | break; | ||
| 1145 | } | 3271 | } |
| 1146 | } | 3272 | } |
| 1147 | 3273 | ||
| 3274 | /* some are called w/ irq disabled, don't disturb irq status */ | ||
| 3275 | spin_lock_irqsave(&gcwq->lock, flags); | ||
| 3276 | |||
| 1148 | switch (action) { | 3277 | switch (action) { |
| 1149 | case CPU_UP_CANCELED: | 3278 | case CPU_DOWN_PREPARE: |
| 3279 | /* initialize trustee and tell it to acquire the gcwq */ | ||
| 3280 | BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); | ||
| 3281 | gcwq->trustee = new_trustee; | ||
| 3282 | gcwq->trustee_state = TRUSTEE_START; | ||
| 3283 | wake_up_process(gcwq->trustee); | ||
| 3284 | wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); | ||
| 3285 | /* fall through */ | ||
| 3286 | case CPU_UP_PREPARE: | ||
| 3287 | BUG_ON(gcwq->first_idle); | ||
| 3288 | gcwq->first_idle = new_worker; | ||
| 3289 | break; | ||
| 3290 | |||
| 3291 | case CPU_DYING: | ||
| 3292 | /* | ||
| 3293 | * Before this, the trustee and all workers except for | ||
| 3294 | * the ones which are still executing works from | ||
| 3295 | * before the last CPU down must be on the cpu. After | ||
| 3296 | * this, they'll all be diasporas. | ||
| 3297 | */ | ||
| 3298 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
| 3299 | break; | ||
| 3300 | |||
| 1150 | case CPU_POST_DEAD: | 3301 | case CPU_POST_DEAD: |
| 1151 | cpumask_clear_cpu(cpu, cpu_populated_map); | 3302 | gcwq->trustee_state = TRUSTEE_BUTCHER; |
| 3303 | /* fall through */ | ||
| 3304 | case CPU_UP_CANCELED: | ||
| 3305 | destroy_worker(gcwq->first_idle); | ||
| 3306 | gcwq->first_idle = NULL; | ||
| 3307 | break; | ||
| 3308 | |||
| 3309 | case CPU_DOWN_FAILED: | ||
| 3310 | case CPU_ONLINE: | ||
| 3311 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | ||
| 3312 | if (gcwq->trustee_state != TRUSTEE_DONE) { | ||
| 3313 | gcwq->trustee_state = TRUSTEE_RELEASE; | ||
| 3314 | wake_up_process(gcwq->trustee); | ||
| 3315 | wait_trustee_state(gcwq, TRUSTEE_DONE); | ||
| 3316 | } | ||
| 3317 | |||
| 3318 | /* | ||
| 3319 | * Trustee is done and there might be no worker left. | ||
| 3320 | * Put the first_idle in and request a real manager to | ||
| 3321 | * take a look. | ||
| 3322 | */ | ||
| 3323 | spin_unlock_irq(&gcwq->lock); | ||
| 3324 | kthread_bind(gcwq->first_idle->task, cpu); | ||
| 3325 | spin_lock_irq(&gcwq->lock); | ||
| 3326 | gcwq->flags |= GCWQ_MANAGE_WORKERS; | ||
| 3327 | start_worker(gcwq->first_idle); | ||
| 3328 | gcwq->first_idle = NULL; | ||
| 3329 | break; | ||
| 1152 | } | 3330 | } |
| 1153 | 3331 | ||
| 1154 | return notifier_from_errno(err); | 3332 | spin_unlock_irqrestore(&gcwq->lock, flags); |
| 3333 | |||
| 3334 | return notifier_from_errno(0); | ||
| 1155 | } | 3335 | } |
| 1156 | 3336 | ||
| 1157 | #ifdef CONFIG_SMP | 3337 | #ifdef CONFIG_SMP |
| @@ -1201,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) | |||
| 1201 | EXPORT_SYMBOL_GPL(work_on_cpu); | 3381 | EXPORT_SYMBOL_GPL(work_on_cpu); |
| 1202 | #endif /* CONFIG_SMP */ | 3382 | #endif /* CONFIG_SMP */ |
| 1203 | 3383 | ||
| 1204 | void __init init_workqueues(void) | 3384 | #ifdef CONFIG_FREEZER |
| 3385 | |||
| 3386 | /** | ||
| 3387 | * freeze_workqueues_begin - begin freezing workqueues | ||
| 3388 | * | ||
| 3389 | * Start freezing workqueues. After this function returns, all | ||
| 3390 | * freezeable workqueues will queue new works to their delayed_works | ||
| 3391 | * list instead of gcwq->worklist. | ||
| 3392 | * | ||
| 3393 | * CONTEXT: | ||
| 3394 | * Grabs and releases workqueue_lock and gcwq->lock's. | ||
| 3395 | */ | ||
| 3396 | void freeze_workqueues_begin(void) | ||
| 1205 | { | 3397 | { |
| 1206 | alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); | 3398 | unsigned int cpu; |
| 1207 | 3399 | ||
| 1208 | cpumask_copy(cpu_populated_map, cpu_online_mask); | 3400 | spin_lock(&workqueue_lock); |
| 1209 | singlethread_cpu = cpumask_first(cpu_possible_mask); | 3401 | |
| 1210 | cpu_singlethread_map = cpumask_of(singlethread_cpu); | 3402 | BUG_ON(workqueue_freezing); |
| 1211 | hotcpu_notifier(workqueue_cpu_callback, 0); | 3403 | workqueue_freezing = true; |
| 1212 | keventd_wq = create_workqueue("events"); | 3404 | |
| 1213 | BUG_ON(!keventd_wq); | 3405 | for_each_gcwq_cpu(cpu) { |
| 3406 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3407 | struct workqueue_struct *wq; | ||
| 3408 | |||
| 3409 | spin_lock_irq(&gcwq->lock); | ||
| 3410 | |||
| 3411 | BUG_ON(gcwq->flags & GCWQ_FREEZING); | ||
| 3412 | gcwq->flags |= GCWQ_FREEZING; | ||
| 3413 | |||
| 3414 | list_for_each_entry(wq, &workqueues, list) { | ||
| 3415 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 3416 | |||
| 3417 | if (cwq && wq->flags & WQ_FREEZEABLE) | ||
| 3418 | cwq->max_active = 0; | ||
| 3419 | } | ||
| 3420 | |||
| 3421 | spin_unlock_irq(&gcwq->lock); | ||
| 3422 | } | ||
| 3423 | |||
| 3424 | spin_unlock(&workqueue_lock); | ||
| 3425 | } | ||
| 3426 | |||
| 3427 | /** | ||
| 3428 | * freeze_workqueues_busy - are freezeable workqueues still busy? | ||
| 3429 | * | ||
| 3430 | * Check whether freezing is complete. This function must be called | ||
| 3431 | * between freeze_workqueues_begin() and thaw_workqueues(). | ||
| 3432 | * | ||
| 3433 | * CONTEXT: | ||
| 3434 | * Grabs and releases workqueue_lock. | ||
| 3435 | * | ||
| 3436 | * RETURNS: | ||
| 3437 | * %true if some freezeable workqueues are still busy. %false if | ||
| 3438 | * freezing is complete. | ||
| 3439 | */ | ||
| 3440 | bool freeze_workqueues_busy(void) | ||
| 3441 | { | ||
| 3442 | unsigned int cpu; | ||
| 3443 | bool busy = false; | ||
| 3444 | |||
| 3445 | spin_lock(&workqueue_lock); | ||
| 3446 | |||
| 3447 | BUG_ON(!workqueue_freezing); | ||
| 3448 | |||
| 3449 | for_each_gcwq_cpu(cpu) { | ||
| 3450 | struct workqueue_struct *wq; | ||
| 3451 | /* | ||
| 3452 | * nr_active is monotonically decreasing. It's safe | ||
| 3453 | * to peek without lock. | ||
| 3454 | */ | ||
| 3455 | list_for_each_entry(wq, &workqueues, list) { | ||
| 3456 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 3457 | |||
| 3458 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | ||
| 3459 | continue; | ||
| 3460 | |||
| 3461 | BUG_ON(cwq->nr_active < 0); | ||
| 3462 | if (cwq->nr_active) { | ||
| 3463 | busy = true; | ||
| 3464 | goto out_unlock; | ||
| 3465 | } | ||
| 3466 | } | ||
| 3467 | } | ||
| 3468 | out_unlock: | ||
| 3469 | spin_unlock(&workqueue_lock); | ||
| 3470 | return busy; | ||
| 3471 | } | ||
| 3472 | |||
| 3473 | /** | ||
| 3474 | * thaw_workqueues - thaw workqueues | ||
| 3475 | * | ||
| 3476 | * Thaw workqueues. Normal queueing is restored and all collected | ||
| 3477 | * frozen works are transferred to their respective gcwq worklists. | ||
| 3478 | * | ||
| 3479 | * CONTEXT: | ||
| 3480 | * Grabs and releases workqueue_lock and gcwq->lock's. | ||
| 3481 | */ | ||
| 3482 | void thaw_workqueues(void) | ||
| 3483 | { | ||
| 3484 | unsigned int cpu; | ||
| 3485 | |||
| 3486 | spin_lock(&workqueue_lock); | ||
| 3487 | |||
| 3488 | if (!workqueue_freezing) | ||
| 3489 | goto out_unlock; | ||
| 3490 | |||
| 3491 | for_each_gcwq_cpu(cpu) { | ||
| 3492 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3493 | struct workqueue_struct *wq; | ||
| 3494 | |||
| 3495 | spin_lock_irq(&gcwq->lock); | ||
| 3496 | |||
| 3497 | BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); | ||
| 3498 | gcwq->flags &= ~GCWQ_FREEZING; | ||
| 3499 | |||
| 3500 | list_for_each_entry(wq, &workqueues, list) { | ||
| 3501 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 3502 | |||
| 3503 | if (!cwq || !(wq->flags & WQ_FREEZEABLE)) | ||
| 3504 | continue; | ||
| 3505 | |||
| 3506 | /* restore max_active and repopulate worklist */ | ||
| 3507 | cwq->max_active = wq->saved_max_active; | ||
| 3508 | |||
| 3509 | while (!list_empty(&cwq->delayed_works) && | ||
| 3510 | cwq->nr_active < cwq->max_active) | ||
| 3511 | cwq_activate_first_delayed(cwq); | ||
| 3512 | } | ||
| 3513 | |||
| 3514 | wake_up_worker(gcwq); | ||
| 3515 | |||
| 3516 | spin_unlock_irq(&gcwq->lock); | ||
| 3517 | } | ||
| 3518 | |||
| 3519 | workqueue_freezing = false; | ||
| 3520 | out_unlock: | ||
| 3521 | spin_unlock(&workqueue_lock); | ||
| 3522 | } | ||
| 3523 | #endif /* CONFIG_FREEZER */ | ||
| 3524 | |||
| 3525 | static int __init init_workqueues(void) | ||
| 3526 | { | ||
| 3527 | unsigned int cpu; | ||
| 3528 | int i; | ||
| 3529 | |||
| 3530 | hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); | ||
| 3531 | |||
| 3532 | /* initialize gcwqs */ | ||
| 3533 | for_each_gcwq_cpu(cpu) { | ||
| 3534 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3535 | |||
| 3536 | spin_lock_init(&gcwq->lock); | ||
| 3537 | INIT_LIST_HEAD(&gcwq->worklist); | ||
| 3538 | gcwq->cpu = cpu; | ||
| 3539 | if (cpu == WORK_CPU_UNBOUND) | ||
| 3540 | gcwq->flags |= GCWQ_DISASSOCIATED; | ||
| 3541 | |||
| 3542 | INIT_LIST_HEAD(&gcwq->idle_list); | ||
| 3543 | for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) | ||
| 3544 | INIT_HLIST_HEAD(&gcwq->busy_hash[i]); | ||
| 3545 | |||
| 3546 | init_timer_deferrable(&gcwq->idle_timer); | ||
| 3547 | gcwq->idle_timer.function = idle_worker_timeout; | ||
| 3548 | gcwq->idle_timer.data = (unsigned long)gcwq; | ||
| 3549 | |||
| 3550 | setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, | ||
| 3551 | (unsigned long)gcwq); | ||
| 3552 | |||
| 3553 | ida_init(&gcwq->worker_ida); | ||
| 3554 | |||
| 3555 | gcwq->trustee_state = TRUSTEE_DONE; | ||
| 3556 | init_waitqueue_head(&gcwq->trustee_wait); | ||
| 3557 | } | ||
| 3558 | |||
| 3559 | /* create the initial worker */ | ||
| 3560 | for_each_online_gcwq_cpu(cpu) { | ||
| 3561 | struct global_cwq *gcwq = get_gcwq(cpu); | ||
| 3562 | struct worker *worker; | ||
| 3563 | |||
| 3564 | worker = create_worker(gcwq, true); | ||
| 3565 | BUG_ON(!worker); | ||
| 3566 | spin_lock_irq(&gcwq->lock); | ||
| 3567 | start_worker(worker); | ||
| 3568 | spin_unlock_irq(&gcwq->lock); | ||
| 3569 | } | ||
| 3570 | |||
| 3571 | system_wq = alloc_workqueue("events", 0, 0); | ||
| 3572 | system_long_wq = alloc_workqueue("events_long", 0, 0); | ||
| 3573 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | ||
| 3574 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | ||
| 3575 | WQ_UNBOUND_MAX_ACTIVE); | ||
| 3576 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || !system_unbound_wq); /* all four allocations above must succeed */ | ||
| 3577 | return 0; | ||
| 1214 | } | 3578 | } |
| 3579 | early_initcall(init_workqueues); | ||
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 000000000000..2d10fc98dc79 --- /dev/null +++ b/kernel/workqueue_sched.h | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | /* | ||
| 2 | * kernel/workqueue_sched.h | ||
| 3 | * | ||
| 4 | * Scheduler hooks for concurrency managed workqueue. Only to be | ||
| 5 | * included from sched.c and workqueue.c. | ||
| 6 | */ | ||
| 7 | void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); | ||
| 8 | struct task_struct *wq_worker_sleeping(struct task_struct *task, | ||
| 9 | unsigned int cpu); | ||
