path: root/kernel
author    Michal Marek <mmarek@suse.cz>    2010-08-04 07:59:13 -0400
committer Michal Marek <mmarek@suse.cz>    2010-08-04 07:59:13 -0400
commit    772320e84588dcbe1600ffb83e5f328f2209ac2a (patch)
tree      a7de21b79340aeaa17c58126f6b801b82c77b53a /kernel
parent    1ce53adf13a54375d2a5c7cdbe341b2558389615 (diff)
parent    9fe6206f400646a2322096b56c59891d530e8d51 (diff)
Merge commit 'v2.6.35' into kbuild/kbuild
Conflicts: arch/powerpc/Makefile
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 11
-rw-r--r--  kernel/acct.c | 47
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit_tree.c | 101
-rw-r--r--  kernel/audit_watch.c | 1
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 10
-rw-r--r--  kernel/capability.c | 5
-rw-r--r--  kernel/cgroup.c | 775
-rw-r--r--  kernel/cgroup_freezer.c | 36
-rw-r--r--  kernel/compat.c | 26
-rw-r--r--  kernel/cpu.c | 172
-rw-r--r--  kernel/cpuset.c | 251
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 89
-rw-r--r--  kernel/debug/Makefile | 6
-rw-r--r--  kernel/debug/debug_core.c | 983
-rw-r--r--  kernel/debug/debug_core.h | 81
-rw-r--r--  kernel/debug/gdbstub.c | 1014
-rw-r--r--  kernel/debug/kdb/.gitignore | 1
-rw-r--r--  kernel/debug/kdb/Makefile | 25
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 564
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 210
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 35
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 169
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 826
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 212
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2846
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 300
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 927
-rw-r--r--  kernel/early_res.c | 590
-rw-r--r--  kernel/elfcore.c | 28
-rw-r--r--  kernel/exec_domain.c | 18
-rw-r--r--  kernel/exit.c | 63
-rw-r--r--  kernel/fork.c | 138
-rw-r--r--  kernel/futex.c | 47
-rw-r--r--  kernel/futex_compat.c | 6
-rw-r--r--  kernel/groups.c | 6
-rw-r--r--  kernel/hrtimer.c | 69
-rw-r--r--  kernel/hw_breakpoint.c | 259
-rw-r--r--  kernel/irq/chip.c | 89
-rw-r--r--  kernel/irq/devres.c | 4
-rw-r--r--  kernel/irq/handle.c | 61
-rw-r--r--  kernel/irq/internals.h | 6
-rw-r--r--  kernel/irq/manage.c | 104
-rw-r--r--  kernel/irq/numa_migrate.c | 5
-rw-r--r--  kernel/irq/proc.c | 61
-rw-r--r--  kernel/kallsyms.c | 22
-rw-r--r--  kernel/kexec.c | 13
-rw-r--r--  kernel/kfifo.c | 6
-rw-r--r--  kernel/kgdb.c | 1760
-rw-r--r--  kernel/kmod.c | 193
-rw-r--r--  kernel/kprobes.c | 804
-rw-r--r--  kernel/ksysfs.c | 13
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 1
-rw-r--r--  kernel/lockdep.c | 143
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 537
-rw-r--r--  kernel/mutex.c | 7
-rw-r--r--  kernel/notifier.c | 6
-rw-r--r--  kernel/nsproxy.c | 14
-rw-r--r--  kernel/padata.c | 774
-rw-r--r--  kernel/panic.c | 73
-rw-r--r--  kernel/params.c | 12
-rw-r--r--  kernel/perf_event.c | 1788
-rw-r--r--  kernel/pid.c | 13
-rw-r--r--  kernel/pid_namespace.c | 8
-rw-r--r--  kernel/pm_qos_params.c | 218
-rw-r--r--  kernel/posix-cpu-timers.c | 346
-rw-r--r--  kernel/posix-timers.c | 13
-rw-r--r--  kernel/power/Kconfig | 28
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/hibernate.c | 10
-rw-r--r--  kernel/power/main.c | 31
-rw-r--r--  kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c) | 25
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 150
-rw-r--r--  kernel/power/suspend.c | 10
-rw-r--r--  kernel/power/swap.c | 338
-rw-r--r--  kernel/power/swsusp.c | 58
-rw-r--r--  kernel/power/user.c | 62
-rw-r--r--  kernel/printk.c | 80
-rw-r--r--  kernel/profile.c | 12
-rw-r--r--  kernel/ptrace.c | 126
-rw-r--r--  kernel/range.c | 163
-rw-r--r--  kernel/rcupdate.c | 51
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 106
-rw-r--r--  kernel/rcutree.c | 389
-rw-r--r--  kernel/rcutree.h | 84
-rw-r--r--  kernel/rcutree_plugin.h | 296
-rw-r--r--  kernel/rcutree_trace.c | 18
-rw-r--r--  kernel/relay.c | 22
-rw-r--r--  kernel/res_counter.c | 1
-rw-r--r--  kernel/resource.c | 126
-rw-r--r--  kernel/sched.c | 3093
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  kernel/sched_cpupri.c | 7
-rw-r--r--  kernel/sched_debug.c | 124
-rw-r--r--  kernel/sched_fair.c | 2027
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 31
-rw-r--r--  kernel/sched_rt.c | 77
-rw-r--r--  kernel/signal.c | 108
-rw-r--r--  kernel/slow-work.c | 2
-rw-r--r--  kernel/slow-work.h | 8
-rw-r--r--  kernel/smp.c | 11
-rw-r--r--  kernel/softirq.c | 21
-rw-r--r--  kernel/softlockup.c | 19
-rw-r--r--  kernel/srcu.c | 53
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 115
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 662
-rw-r--r--  kernel/sysctl_binary.c | 18
-rw-r--r--  kernel/taskstats.c | 7
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/clocksource.c | 84
-rw-r--r--  kernel/time/ntp.c | 12
-rw-r--r--  kernel/time/tick-oneshot.c | 52
-rw-r--r--  kernel/time/tick-sched.c | 87
-rw-r--r--  kernel/time/timecompare.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 41
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/timer.c | 153
-rw-r--r--  kernel/trace/Kconfig | 26
-rw-r--r--  kernel/trace/Makefile | 5
-rw-r--r--  kernel/trace/blktrace.c | 146
-rw-r--r--  kernel/trace/ftrace.c | 172
-rw-r--r--  kernel/trace/kmemtrace.c | 70
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 258
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 6
-rw-r--r--  kernel/trace/trace.c | 408
-rw-r--r--  kernel/trace/trace.h | 67
-rw-r--r--  kernel/trace/trace_branch.c | 27
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_event_perf.c | 195
-rw-r--r--  kernel/trace/trace_event_profile.c | 122
-rw-r--r--  kernel/trace/trace_events.c | 219
-rw-r--r--  kernel/trace/trace_events_filter.c | 35
-rw-r--r--  kernel/trace/trace_export.c | 103
-rw-r--r--  kernel/trace/trace_functions_graph.c | 284
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 947
-rw-r--r--  kernel/trace/trace_ksym.c | 27
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 155
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 29
-rw-r--r--  kernel/trace/trace_selftest.c | 65
-rw-r--r--  kernel/trace/trace_stack.c | 24
-rw-r--r--  kernel/trace/trace_stat.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 374
-rw-r--r--  kernel/trace/trace_workqueue.c | 27
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/tsacct.c | 1
-rw-r--r--  kernel/user.c | 316
-rw-r--r--  kernel/user_namespace.c | 4
-rw-r--r--  kernel/workqueue.c | 47
169 files changed, 22169 insertions, 10731 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..057472fbc272 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o range.o
14obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
14obj-y += groups.o 15obj-y += groups.o
15 16
16ifdef CONFIG_FUNCTION_TRACER 17ifdef CONFIG_FUNCTION_TRACER
@@ -67,14 +68,14 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
67obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
68obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
69obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
70obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_GCOV_KERNEL) += gcov/
75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
76obj-$(CONFIG_KPROBES) += kprobes.o 77obj-$(CONFIG_KPROBES) += kprobes.o
77obj-$(CONFIG_KGDB) += kgdb.o 78obj-$(CONFIG_KGDB) += debug/
78obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
79obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
90obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 92obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 93obj-$(CONFIG_LATENCYTOP) += latencytop.o
94obj-$(CONFIG_BINFMT_ELF) += elfcore.o
95obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
96obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
93obj-$(CONFIG_FUNCTION_TRACER) += trace/ 97obj-$(CONFIG_FUNCTION_TRACER) += trace/
94obj-$(CONFIG_TRACING) += trace/ 98obj-$(CONFIG_TRACING) += trace/
95obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
@@ -100,6 +104,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 104obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 105obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 106obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
107obj-$(CONFIG_PADATA) += padata.o
103 108
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 109ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 110# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt; 218 struct vfsmount *mnt;
219 int error;
220 struct pid_namespace *ns; 219 struct pid_namespace *ns;
221 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
222 221
@@ -244,13 +243,6 @@ static int acct_on(char *name)
244 } 243 }
245 } 244 }
246 245
247 error = security_acct(file);
248 if (error) {
249 kfree(acct);
250 filp_close(file, NULL);
251 return error;
252 }
253
254 spin_lock(&acct_lock); 246 spin_lock(&acct_lock);
255 if (ns->bacct == NULL) { 247 if (ns->bacct == NULL) {
256 ns->bacct = acct; 248 ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
281 */ 273 */
282SYSCALL_DEFINE1(acct, const char __user *, name) 274SYSCALL_DEFINE1(acct, const char __user *, name)
283{ 275{
284 int error; 276 int error = 0;
285 277
286 if (!capable(CAP_SYS_PACCT)) 278 if (!capable(CAP_SYS_PACCT))
287 return -EPERM; 279 return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
299 if (acct == NULL) 291 if (acct == NULL)
300 return 0; 292 return 0;
301 293
302 error = security_acct(NULL); 294 spin_lock(&acct_lock);
303 if (!error) { 295 acct_file_reopen(acct, NULL, NULL);
304 spin_lock(&acct_lock); 296 spin_unlock(&acct_lock);
305 acct_file_reopen(acct, NULL, NULL);
306 spin_unlock(&acct_lock);
307 }
308 } 297 }
298
309 return error; 299 return error;
310} 300}
311 301
@@ -353,17 +343,18 @@ restart:
353 343
354void acct_exit_ns(struct pid_namespace *ns) 344void acct_exit_ns(struct pid_namespace *ns)
355{ 345{
356 struct bsd_acct_struct *acct; 346 struct bsd_acct_struct *acct = ns->bacct;
357 347
358 spin_lock(&acct_lock); 348 if (acct == NULL)
359 acct = ns->bacct; 349 return;
360 if (acct != NULL) {
361 if (acct->file != NULL)
362 acct_file_reopen(acct, NULL, NULL);
363 350
364 kfree(acct); 351 del_timer_sync(&acct->timer);
365 } 352 spin_lock(&acct_lock);
353 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL);
366 spin_unlock(&acct_lock); 355 spin_unlock(&acct_lock);
356
357 kfree(acct);
367} 358}
368 359
369/* 360/*
@@ -588,16 +579,6 @@ out:
588} 579}
589 580
590/** 581/**
591 * acct_init_pacct - initialize a new pacct_struct
592 * @pacct: per-process accounting info struct to initialize
593 */
594void acct_init_pacct(struct pacct_struct *pacct)
595{
596 memset(pacct, 0, sizeof(struct pacct_struct));
597 pacct->ac_utime = pacct->ac_stime = cputime_zero;
598}
599
600/**
601 * acct_collect - collect accounting information into pacct_struct 582 * acct_collect - collect accounting information into pacct_struct
602 * @exitcode: task exit code 583 * @exitcode: task exit code
603 * @group_dead: not 0, if this thread is the last one in the process. 584 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
56#include <linux/init.h> 56#include <linux/init.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/slab.h>
59#include <asm/atomic.h> 60#include <asm/atomic.h>
60 61
61static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h>
49#include <linux/err.h> 50#include <linux/err.h>
50#include <linux/kthread.h> 51#include <linux/kthread.h>
51 52
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
398 skb_get(skb); 399 skb_get(skb);
399 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); 400 err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
400 if (err < 0) { 401 if (err < 0) {
401 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 402 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
402 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 403 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
403 audit_log_lost("auditd dissapeared\n"); 404 audit_log_lost("auditd dissapeared\n");
404 audit_pid = 0; 405 audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
6#include <linux/slab.h>
6 7
7struct audit_tree; 8struct audit_tree;
8struct audit_chunk; 9struct audit_chunk;
@@ -548,6 +549,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
548 return 0; 549 return 0;
549} 550}
550 551
552static int compare_root(struct vfsmount *mnt, void *arg)
553{
554 return mnt->mnt_root->d_inode == arg;
555}
556
551void audit_trim_trees(void) 557void audit_trim_trees(void)
552{ 558{
553 struct list_head cursor; 559 struct list_head cursor;
@@ -559,7 +565,6 @@ void audit_trim_trees(void)
559 struct path path; 565 struct path path;
560 struct vfsmount *root_mnt; 566 struct vfsmount *root_mnt;
561 struct node *node; 567 struct node *node;
562 struct list_head list;
563 int err; 568 int err;
564 569
565 tree = container_of(cursor.next, struct audit_tree, list); 570 tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +582,16 @@ void audit_trim_trees(void)
577 if (!root_mnt) 582 if (!root_mnt)
578 goto skip_it; 583 goto skip_it;
579 584
580 list_add_tail(&list, &root_mnt->mnt_list);
581 spin_lock(&hash_lock); 585 spin_lock(&hash_lock);
582 list_for_each_entry(node, &tree->chunks, list) { 586 list_for_each_entry(node, &tree->chunks, list) {
583 struct audit_chunk *chunk = find_chunk(node); 587 struct inode *inode = find_chunk(node)->watch.inode;
584 struct inode *inode = chunk->watch.inode;
585 struct vfsmount *mnt;
586 node->index |= 1U<<31; 588 node->index |= 1U<<31;
587 list_for_each_entry(mnt, &list, mnt_list) { 589 if (iterate_mounts(compare_root, inode, root_mnt))
588 if (mnt->mnt_root->d_inode == inode) { 590 node->index &= ~(1U<<31);
589 node->index &= ~(1U<<31);
590 break;
591 }
592 }
593 } 591 }
594 spin_unlock(&hash_lock); 592 spin_unlock(&hash_lock);
595 trim_marked(tree); 593 trim_marked(tree);
596 put_tree(tree); 594 put_tree(tree);
597 list_del_init(&list);
598 drop_collected_mounts(root_mnt); 595 drop_collected_mounts(root_mnt);
599skip_it: 596skip_it:
600 mutex_lock(&audit_filter_mutex); 597 mutex_lock(&audit_filter_mutex);
@@ -603,22 +600,6 @@ skip_it:
603 mutex_unlock(&audit_filter_mutex); 600 mutex_unlock(&audit_filter_mutex);
604} 601}
605 602
606static int is_under(struct vfsmount *mnt, struct dentry *dentry,
607 struct path *path)
608{
609 if (mnt != path->mnt) {
610 for (;;) {
611 if (mnt->mnt_parent == mnt)
612 return 0;
613 if (mnt->mnt_parent == path->mnt)
614 break;
615 mnt = mnt->mnt_parent;
616 }
617 dentry = mnt->mnt_mountpoint;
618 }
619 return is_subdir(dentry, path->dentry);
620}
621
622int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 603int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
623{ 604{
624 605
@@ -638,13 +619,17 @@ void audit_put_tree(struct audit_tree *tree)
638 put_tree(tree); 619 put_tree(tree);
639} 620}
640 621
622static int tag_mount(struct vfsmount *mnt, void *arg)
623{
624 return tag_chunk(mnt->mnt_root->d_inode, arg);
625}
626
641/* called with audit_filter_mutex */ 627/* called with audit_filter_mutex */
642int audit_add_tree_rule(struct audit_krule *rule) 628int audit_add_tree_rule(struct audit_krule *rule)
643{ 629{
644 struct audit_tree *seed = rule->tree, *tree; 630 struct audit_tree *seed = rule->tree, *tree;
645 struct path path; 631 struct path path;
646 struct vfsmount *mnt, *p; 632 struct vfsmount *mnt;
647 struct list_head list;
648 int err; 633 int err;
649 634
650 list_for_each_entry(tree, &tree_list, list) { 635 list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +655,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
670 err = -ENOMEM; 655 err = -ENOMEM;
671 goto Err; 656 goto Err;
672 } 657 }
673 list_add_tail(&list, &mnt->mnt_list);
674 658
675 get_tree(tree); 659 get_tree(tree);
676 list_for_each_entry(p, &list, mnt_list) { 660 err = iterate_mounts(tag_mount, tree, mnt);
677 err = tag_chunk(p->mnt_root->d_inode, tree);
678 if (err)
679 break;
680 }
681
682 list_del(&list);
683 drop_collected_mounts(mnt); 661 drop_collected_mounts(mnt);
684 662
685 if (!err) { 663 if (!err) {
@@ -714,31 +692,23 @@ int audit_tag_tree(char *old, char *new)
714{ 692{
715 struct list_head cursor, barrier; 693 struct list_head cursor, barrier;
716 int failed = 0; 694 int failed = 0;
717 struct path path; 695 struct path path1, path2;
718 struct vfsmount *tagged; 696 struct vfsmount *tagged;
719 struct list_head list;
720 struct vfsmount *mnt;
721 struct dentry *dentry;
722 int err; 697 int err;
723 698
724 err = kern_path(new, 0, &path); 699 err = kern_path(new, 0, &path2);
725 if (err) 700 if (err)
726 return err; 701 return err;
727 tagged = collect_mounts(&path); 702 tagged = collect_mounts(&path2);
728 path_put(&path); 703 path_put(&path2);
729 if (!tagged) 704 if (!tagged)
730 return -ENOMEM; 705 return -ENOMEM;
731 706
732 err = kern_path(old, 0, &path); 707 err = kern_path(old, 0, &path1);
733 if (err) { 708 if (err) {
734 drop_collected_mounts(tagged); 709 drop_collected_mounts(tagged);
735 return err; 710 return err;
736 } 711 }
737 mnt = mntget(path.mnt);
738 dentry = dget(path.dentry);
739 path_put(&path);
740
741 list_add_tail(&list, &tagged->mnt_list);
742 712
743 mutex_lock(&audit_filter_mutex); 713 mutex_lock(&audit_filter_mutex);
744 list_add(&barrier, &tree_list); 714 list_add(&barrier, &tree_list);
@@ -746,7 +716,7 @@ int audit_tag_tree(char *old, char *new)
746 716
747 while (cursor.next != &tree_list) { 717 while (cursor.next != &tree_list) {
748 struct audit_tree *tree; 718 struct audit_tree *tree;
749 struct vfsmount *p; 719 int good_one = 0;
750 720
751 tree = container_of(cursor.next, struct audit_tree, list); 721 tree = container_of(cursor.next, struct audit_tree, list);
752 get_tree(tree); 722 get_tree(tree);
@@ -754,30 +724,19 @@ int audit_tag_tree(char *old, char *new)
754 list_add(&cursor, &tree->list); 724 list_add(&cursor, &tree->list);
755 mutex_unlock(&audit_filter_mutex); 725 mutex_unlock(&audit_filter_mutex);
756 726
757 err = kern_path(tree->pathname, 0, &path); 727 err = kern_path(tree->pathname, 0, &path2);
758 if (err) { 728 if (!err) {
759 put_tree(tree); 729 good_one = path_is_under(&path1, &path2);
760 mutex_lock(&audit_filter_mutex); 730 path_put(&path2);
761 continue;
762 } 731 }
763 732
764 spin_lock(&vfsmount_lock); 733 if (!good_one) {
765 if (!is_under(mnt, dentry, &path)) {
766 spin_unlock(&vfsmount_lock);
767 path_put(&path);
768 put_tree(tree); 734 put_tree(tree);
769 mutex_lock(&audit_filter_mutex); 735 mutex_lock(&audit_filter_mutex);
770 continue; 736 continue;
771 } 737 }
772 spin_unlock(&vfsmount_lock);
773 path_put(&path);
774
775 list_for_each_entry(p, &list, mnt_list) {
776 failed = tag_chunk(p->mnt_root->d_inode, tree);
777 if (failed)
778 break;
779 }
780 738
739 failed = iterate_mounts(tag_mount, tree, tagged);
781 if (failed) { 740 if (failed) {
782 put_tree(tree); 741 put_tree(tree);
783 mutex_lock(&audit_filter_mutex); 742 mutex_lock(&audit_filter_mutex);
@@ -818,10 +777,8 @@ int audit_tag_tree(char *old, char *new)
818 } 777 }
819 list_del(&barrier); 778 list_del(&barrier);
820 list_del(&cursor); 779 list_del(&cursor);
821 list_del(&list);
822 mutex_unlock(&audit_filter_mutex); 780 mutex_unlock(&audit_filter_mutex);
823 dput(dentry); 781 path_put(&path1);
824 mntput(mnt);
825 drop_collected_mounts(tagged); 782 drop_collected_mounts(tagged);
826 return failed; 783 return failed;
827} 784}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/inotify.h> 31#include <linux/inotify.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include "audit.h" 33#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include "audit.h" 32#include "audit.h"
32 33
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/module.h>
52#include <linux/slab.h>
52#include <linux/mount.h> 53#include <linux/mount.h>
53#include <linux/socket.h> 54#include <linux/socket.h>
54#include <linux/mqueue.h> 55#include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
1893{ 1894{
1894 if (context->name_count >= AUDIT_NAMES) { 1895 if (context->name_count >= AUDIT_NAMES) {
1895 if (inode) 1896 if (inode)
1896 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1897 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1897 "dev=%02x:%02x, inode=%lu\n", 1898 "dev=%02x:%02x, inode=%lu\n",
1898 MAJOR(inode->i_sb->s_dev), 1899 MAJOR(inode->i_sb->s_dev),
1899 MINOR(inode->i_sb->s_dev), 1900 MINOR(inode->i_sb->s_dev),
@@ -1988,7 +1989,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1988 1989
1989/** 1990/**
1990 * audit_inode_child - collect inode info for created/removed objects 1991 * audit_inode_child - collect inode info for created/removed objects
1991 * @dname: inode's dentry name
1992 * @dentry: dentry being audited 1992 * @dentry: dentry being audited
1993 * @parent: inode of dentry parent 1993 * @parent: inode of dentry parent
1994 * 1994 *
@@ -2000,13 +2000,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2000 * must be hooked prior, in order to capture the target inode during 2000 * must be hooked prior, in order to capture the target inode during
2001 * unsuccessful attempts. 2001 * unsuccessful attempts.
2002 */ 2002 */
2003void __audit_inode_child(const char *dname, const struct dentry *dentry, 2003void __audit_inode_child(const struct dentry *dentry,
2004 const struct inode *parent) 2004 const struct inode *parent)
2005{ 2005{
2006 int idx; 2006 int idx;
2007 struct audit_context *context = current->audit_context; 2007 struct audit_context *context = current->audit_context;
2008 const char *found_parent = NULL, *found_child = NULL; 2008 const char *found_parent = NULL, *found_child = NULL;
2009 const struct inode *inode = dentry->d_inode; 2009 const struct inode *inode = dentry->d_inode;
2010 const char *dname = dentry->d_name.name;
2010 int dirlen = 0; 2011 int dirlen = 0;
2011 2012
2012 if (!context->in_syscall) 2013 if (!context->in_syscall)
@@ -2014,9 +2015,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
2014 2015
2015 if (inode) 2016 if (inode)
2016 handle_one(inode); 2017 handle_one(inode);
2017 /* determine matching parent */
2018 if (!dname)
2019 goto add_names;
2020 2018
2021 /* parent is more likely, look for it first */ 2019 /* parent is more likely, look for it first */
2022 for (idx = 0; idx < context->name_count; idx++) { 2020 for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
@@ -135,7 +134,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
135 if (pid && (pid != task_pid_vnr(current))) { 134 if (pid && (pid != task_pid_vnr(current))) {
136 struct task_struct *target; 135 struct task_struct *target;
137 136
138 read_lock(&tasklist_lock); 137 rcu_read_lock();
139 138
140 target = find_task_by_vpid(pid); 139 target = find_task_by_vpid(pid);
141 if (!target) 140 if (!target)
@@ -143,7 +142,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
143 else 142 else
144 ret = security_capget(target, pEp, pIp, pPp); 143 ret = security_capget(target, pEp, pIp, pPp);
145 144
146 read_unlock(&tasklist_lock); 145 rcu_read_unlock();
147 } else 146 } else
148 ret = security_capget(current, pEp, pIp, pPp); 147 ret = security_capget(current, pEp, pIp, pPp);
149 148
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc748044a..3ac6f5b0a64b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov
10 *
7 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 12 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
@@ -43,6 +47,7 @@
43#include <linux/string.h> 47#include <linux/string.h>
44#include <linux/sort.h> 48#include <linux/sort.h>
45#include <linux/kmod.h> 49#include <linux/kmod.h>
50#include <linux/module.h>
46#include <linux/delayacct.h> 51#include <linux/delayacct.h>
47#include <linux/cgroupstats.h> 52#include <linux/cgroupstats.h>
48#include <linux/hash.h> 53#include <linux/hash.h>
@@ -51,15 +56,21 @@
51#include <linux/pid_namespace.h> 56#include <linux/pid_namespace.h>
52#include <linux/idr.h> 57#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
59#include <linux/eventfd.h>
60#include <linux/poll.h>
54 61
55#include <asm/atomic.h> 62#include <asm/atomic.h>
56 63
57static DEFINE_MUTEX(cgroup_mutex); 64static DEFINE_MUTEX(cgroup_mutex);
58 65
59/* Generate an array of cgroup subsystem pointers */ 66/*
67 * Generate an array of cgroup subsystem pointers. At boot time, this is
68 * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
69 * registered after that. The mutable section of this array is protected by
70 * cgroup_mutex.
71 */
60#define SUBSYS(_x) &_x ## _subsys, 72#define SUBSYS(_x) &_x ## _subsys,
61 73static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
62static struct cgroup_subsys *subsys[] = {
63#include <linux/cgroup_subsys.h> 74#include <linux/cgroup_subsys.h>
64}; 75};
65 76
@@ -146,6 +157,35 @@ struct css_id {
146 unsigned short stack[0]; /* Array of Length (depth+1) */ 157 unsigned short stack[0]; /* Array of Length (depth+1) */
147}; 158};
148 159
160/*
161 * cgroup_event represents events which userspace want to recieve.
162 */
163struct cgroup_event {
164 /*
165 * Cgroup which the event belongs to.
166 */
167 struct cgroup *cgrp;
168 /*
169 * Control file which the event associated.
170 */
171 struct cftype *cft;
172 /*
173 * eventfd to signal userspace about the event.
174 */
175 struct eventfd_ctx *eventfd;
176 /*
177 * Each of these stored in a list by the cgroup.
178 */
179 struct list_head list;
180 /*
181 * All fields below needed to unregister event when
182 * userspace closes eventfd.
183 */
184 poll_table pt;
185 wait_queue_head_t *wqh;
186 wait_queue_t wait;
187 struct work_struct remove;
188};
149 189
150/* The list of hierarchy roots */ 190/* The list of hierarchy roots */
151 191
@@ -166,6 +206,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 206 */
167static int need_forkexit_callback __read_mostly; 207static int need_forkexit_callback __read_mostly;
168 208
209#ifdef CONFIG_PROVE_LOCKING
210int cgroup_lock_is_held(void)
211{
212 return lockdep_is_held(&cgroup_mutex);
213}
214#else /* #ifdef CONFIG_PROVE_LOCKING */
215int cgroup_lock_is_held(void)
216{
217 return mutex_is_locked(&cgroup_mutex);
218}
219#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
220
221EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
222
169/* convenient tests for these bits */ 223/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 224inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 225{
@@ -235,7 +289,8 @@ struct cg_cgroup_link {
235static struct css_set init_css_set; 289static struct css_set init_css_set;
236static struct cg_cgroup_link init_css_set_link; 290static struct cg_cgroup_link init_css_set_link;
237 291
238static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); 292static int cgroup_init_idr(struct cgroup_subsys *ss,
293 struct cgroup_subsys_state *css);
239 294
240/* css_set_lock protects the list of css_set objects, and the 295/* css_set_lock protects the list of css_set objects, and the
241 * chain of tasks off each css_set. Nests outside task->alloc_lock 296 * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -433,8 +488,11 @@ static struct css_set *find_existing_css_set(
433 struct hlist_node *node; 488 struct hlist_node *node;
434 struct css_set *cg; 489 struct css_set *cg;
435 490
436 /* Built the set of subsystem state objects that we want to 491 /*
437 * see in the new css_set */ 492 * Build the set of subsystem state objects that we want to see in the
493 * new css_set. while subsystems can change globally, the entries here
494 * won't change, so no need for locking.
495 */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 496 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 if (root->subsys_bits & (1UL << i)) { 497 if (root->subsys_bits & (1UL << i)) {
440 /* Subsystem is in this hierarchy. So we want 498 /* Subsystem is in this hierarchy. So we want
@@ -681,6 +739,7 @@ void cgroup_lock(void)
681{ 739{
682 mutex_lock(&cgroup_mutex); 740 mutex_lock(&cgroup_mutex);
683} 741}
742EXPORT_SYMBOL_GPL(cgroup_lock);
684 743
685/** 744/**
686 * cgroup_unlock - release lock on cgroup changes 745 * cgroup_unlock - release lock on cgroup changes
@@ -691,6 +750,7 @@ void cgroup_unlock(void)
691{ 750{
692 mutex_unlock(&cgroup_mutex); 751 mutex_unlock(&cgroup_mutex);
693} 752}
753EXPORT_SYMBOL_GPL(cgroup_unlock);
694 754
695/* 755/*
696 * A couple of forward declarations required, due to cyclic reference loop: 756 * A couple of forward declarations required, due to cyclic reference loop:
@@ -742,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
742 if (ret) 802 if (ret)
743 break; 803 break;
744 } 804 }
805
745 return ret; 806 return ret;
746} 807}
747 808
@@ -869,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
869 css_put(css); 930 css_put(css);
870} 931}
871 932
872 933/*
934 * Call with cgroup_mutex held. Drops reference counts on modules, including
935 * any duplicate ones that parse_cgroupfs_options took. If this function
936 * returns an error, no reference counts are touched.
937 */
873static int rebind_subsystems(struct cgroupfs_root *root, 938static int rebind_subsystems(struct cgroupfs_root *root,
874 unsigned long final_bits) 939 unsigned long final_bits)
875{ 940{
@@ -877,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
877 struct cgroup *cgrp = &root->top_cgroup; 942 struct cgroup *cgrp = &root->top_cgroup;
878 int i; 943 int i;
879 944
945 BUG_ON(!mutex_is_locked(&cgroup_mutex));
946
880 removed_bits = root->actual_subsys_bits & ~final_bits; 947 removed_bits = root->actual_subsys_bits & ~final_bits;
881 added_bits = final_bits & ~root->actual_subsys_bits; 948 added_bits = final_bits & ~root->actual_subsys_bits;
882 /* Check that any added subsystems are currently free */ 949 /* Check that any added subsystems are currently free */
@@ -885,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
885 struct cgroup_subsys *ss = subsys[i]; 952 struct cgroup_subsys *ss = subsys[i];
886 if (!(bit & added_bits)) 953 if (!(bit & added_bits))
887 continue; 954 continue;
955 /*
956 * Nobody should tell us to do a subsys that doesn't exist:
957 * parse_cgroupfs_options should catch that case and refcounts
958 * ensure that subsystems won't disappear once selected.
959 */
960 BUG_ON(ss == NULL);
888 if (ss->root != &rootnode) { 961 if (ss->root != &rootnode) {
889 /* Subsystem isn't free */ 962 /* Subsystem isn't free */
890 return -EBUSY; 963 return -EBUSY;
@@ -904,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
904 unsigned long bit = 1UL << i; 977 unsigned long bit = 1UL << i;
905 if (bit & added_bits) { 978 if (bit & added_bits) {
906 /* We're binding this subsystem to this hierarchy */ 979 /* We're binding this subsystem to this hierarchy */
980 BUG_ON(ss == NULL);
907 BUG_ON(cgrp->subsys[i]); 981 BUG_ON(cgrp->subsys[i]);
908 BUG_ON(!dummytop->subsys[i]); 982 BUG_ON(!dummytop->subsys[i]);
909 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 983 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -915,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
915 if (ss->bind) 989 if (ss->bind)
916 ss->bind(ss, cgrp); 990 ss->bind(ss, cgrp);
917 mutex_unlock(&ss->hierarchy_mutex); 991 mutex_unlock(&ss->hierarchy_mutex);
992 /* refcount was already taken, and we're keeping it */
918 } else if (bit & removed_bits) { 993 } else if (bit & removed_bits) {
919 /* We're removing this subsystem */ 994 /* We're removing this subsystem */
995 BUG_ON(ss == NULL);
920 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 996 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
921 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 997 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
922 mutex_lock(&ss->hierarchy_mutex); 998 mutex_lock(&ss->hierarchy_mutex);
@@ -927,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
927 subsys[i]->root = &rootnode; 1003 subsys[i]->root = &rootnode;
928 list_move(&ss->sibling, &rootnode.subsys_list); 1004 list_move(&ss->sibling, &rootnode.subsys_list);
929 mutex_unlock(&ss->hierarchy_mutex); 1005 mutex_unlock(&ss->hierarchy_mutex);
1006 /* subsystem is now free - drop reference on module */
1007 module_put(ss->module);
930 } else if (bit & final_bits) { 1008 } else if (bit & final_bits) {
931 /* Subsystem state should already exist */ 1009 /* Subsystem state should already exist */
1010 BUG_ON(ss == NULL);
932 BUG_ON(!cgrp->subsys[i]); 1011 BUG_ON(!cgrp->subsys[i]);
1012 /*
1013 * a refcount was taken, but we already had one, so
1014 * drop the extra reference.
1015 */
1016 module_put(ss->module);
1017#ifdef CONFIG_MODULE_UNLOAD
1018 BUG_ON(ss->module && !module_refcount(ss->module));
1019#endif
933 } else { 1020 } else {
934 /* Subsystem state shouldn't exist */ 1021 /* Subsystem state shouldn't exist */
935 BUG_ON(cgrp->subsys[i]); 1022 BUG_ON(cgrp->subsys[i]);
@@ -971,13 +1058,20 @@ struct cgroup_sb_opts {
971 1058
972}; 1059};
973 1060
974/* Convert a hierarchy specifier into a bitmask of subsystems and 1061/*
975 * flags. */ 1062 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
976static int parse_cgroupfs_options(char *data, 1063 * with cgroup_mutex held to protect the subsys[] array. This function takes
977 struct cgroup_sb_opts *opts) 1064 * refcounts on subsystems to be used, unless it returns error, in which case
1065 * no refcounts are taken.
1066 */
1067static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
978{ 1068{
979 char *token, *o = data ?: "all"; 1069 char *token, *o = data ?: "all";
980 unsigned long mask = (unsigned long)-1; 1070 unsigned long mask = (unsigned long)-1;
1071 int i;
1072 bool module_pin_failed = false;
1073
1074 BUG_ON(!mutex_is_locked(&cgroup_mutex));
981 1075
982#ifdef CONFIG_CPUSETS 1076#ifdef CONFIG_CPUSETS
983 mask = ~(1UL << cpuset_subsys_id); 1077 mask = ~(1UL << cpuset_subsys_id);
@@ -990,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
990 return -EINVAL; 1084 return -EINVAL;
991 if (!strcmp(token, "all")) { 1085 if (!strcmp(token, "all")) {
992 /* Add all non-disabled subsystems */ 1086 /* Add all non-disabled subsystems */
993 int i;
994 opts->subsys_bits = 0; 1087 opts->subsys_bits = 0;
995 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1088 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
996 struct cgroup_subsys *ss = subsys[i]; 1089 struct cgroup_subsys *ss = subsys[i];
1090 if (ss == NULL)
1091 continue;
997 if (!ss->disabled) 1092 if (!ss->disabled)
998 opts->subsys_bits |= 1ul << i; 1093 opts->subsys_bits |= 1ul << i;
999 } 1094 }
@@ -1011,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
1011 if (!opts->release_agent) 1106 if (!opts->release_agent)
1012 return -ENOMEM; 1107 return -ENOMEM;
1013 } else if (!strncmp(token, "name=", 5)) { 1108 } else if (!strncmp(token, "name=", 5)) {
1014 int i;
1015 const char *name = token + 5; 1109 const char *name = token + 5;
1016 /* Can't specify an empty name */ 1110 /* Can't specify an empty name */
1017 if (!strlen(name)) 1111 if (!strlen(name))
@@ -1035,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
1035 return -ENOMEM; 1129 return -ENOMEM;
1036 } else { 1130 } else {
1037 struct cgroup_subsys *ss; 1131 struct cgroup_subsys *ss;
1038 int i;
1039 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1132 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1040 ss = subsys[i]; 1133 ss = subsys[i];
1134 if (ss == NULL)
1135 continue;
1041 if (!strcmp(token, ss->name)) { 1136 if (!strcmp(token, ss->name)) {
1042 if (!ss->disabled) 1137 if (!ss->disabled)
1043 set_bit(i, &opts->subsys_bits); 1138 set_bit(i, &opts->subsys_bits);
@@ -1072,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
1072 if (!opts->subsys_bits && !opts->name) 1167 if (!opts->subsys_bits && !opts->name)
1073 return -EINVAL; 1168 return -EINVAL;
1074 1169
1170 /*
1171 * Grab references on all the modules we'll need, so the subsystems
1172 * don't dance around before rebind_subsystems attaches them. This may
1173 * take duplicate reference counts on a subsystem that's already used,
1174 * but rebind_subsystems handles this case.
1175 */
1176 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1177 unsigned long bit = 1UL << i;
1178
1179 if (!(bit & opts->subsys_bits))
1180 continue;
1181 if (!try_module_get(subsys[i]->module)) {
1182 module_pin_failed = true;
1183 break;
1184 }
1185 }
1186 if (module_pin_failed) {
1187 /*
1188 * oops, one of the modules was going away. this means that we
1189 * raced with a module_delete call, and to the user this is
1190 * essentially a "subsystem doesn't exist" case.
1191 */
1192 for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
1193 /* drop refcounts only on the ones we took */
1194 unsigned long bit = 1UL << i;
1195
1196 if (!(bit & opts->subsys_bits))
1197 continue;
1198 module_put(subsys[i]->module);
1199 }
1200 return -ENOENT;
1201 }
1202
1075 return 0; 1203 return 0;
1076} 1204}
1077 1205
1206static void drop_parsed_module_refcounts(unsigned long subsys_bits)
1207{
1208 int i;
1209 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
1210 unsigned long bit = 1UL << i;
1211
1212 if (!(bit & subsys_bits))
1213 continue;
1214 module_put(subsys[i]->module);
1215 }
1216}
1217
1078static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1218static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1079{ 1219{
1080 int ret = 0; 1220 int ret = 0;
@@ -1091,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1091 if (ret) 1231 if (ret)
1092 goto out_unlock; 1232 goto out_unlock;
1093 1233
1094 /* Don't allow flags to change at remount */ 1234 /* Don't allow flags or name to change at remount */
1095 if (opts.flags != root->flags) { 1235 if (opts.flags != root->flags ||
1096 ret = -EINVAL; 1236 (opts.name && strcmp(opts.name, root->name))) {
1097 goto out_unlock;
1098 }
1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL; 1237 ret = -EINVAL;
1238 drop_parsed_module_refcounts(opts.subsys_bits);
1103 goto out_unlock; 1239 goto out_unlock;
1104 } 1240 }
1105 1241
1106 ret = rebind_subsystems(root, opts.subsys_bits); 1242 ret = rebind_subsystems(root, opts.subsys_bits);
1107 if (ret) 1243 if (ret) {
1244 drop_parsed_module_refcounts(opts.subsys_bits);
1108 goto out_unlock; 1245 goto out_unlock;
1246 }
1109 1247
1110 /* (re)populate subsystem files */ 1248 /* (re)populate subsystem files */
1111 cgroup_populate_dir(cgrp); 1249 cgroup_populate_dir(cgrp);
@@ -1136,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1136 INIT_LIST_HEAD(&cgrp->release_list); 1274 INIT_LIST_HEAD(&cgrp->release_list);
1137 INIT_LIST_HEAD(&cgrp->pidlists); 1275 INIT_LIST_HEAD(&cgrp->pidlists);
1138 mutex_init(&cgrp->pidlist_mutex); 1276 mutex_init(&cgrp->pidlist_mutex);
1277 INIT_LIST_HEAD(&cgrp->event_list);
1278 spin_lock_init(&cgrp->event_list_lock);
1139} 1279}
1140 1280
1141static void init_cgroup_root(struct cgroupfs_root *root) 1281static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1291,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1291 struct cgroupfs_root *new_root; 1431 struct cgroupfs_root *new_root;
1292 1432
1293 /* First find the desired set of subsystems */ 1433 /* First find the desired set of subsystems */
1434 mutex_lock(&cgroup_mutex);
1294 ret = parse_cgroupfs_options(data, &opts); 1435 ret = parse_cgroupfs_options(data, &opts);
1436 mutex_unlock(&cgroup_mutex);
1295 if (ret) 1437 if (ret)
1296 goto out_err; 1438 goto out_err;
1297 1439
@@ -1302,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1302 new_root = cgroup_root_from_opts(&opts); 1444 new_root = cgroup_root_from_opts(&opts);
1303 if (IS_ERR(new_root)) { 1445 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root); 1446 ret = PTR_ERR(new_root);
1305 goto out_err; 1447 goto drop_modules;
1306 } 1448 }
1307 opts.new_root = new_root; 1449 opts.new_root = new_root;
1308 1450
@@ -1311,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1311 if (IS_ERR(sb)) { 1453 if (IS_ERR(sb)) {
1312 ret = PTR_ERR(sb); 1454 ret = PTR_ERR(sb);
1313 cgroup_drop_root(opts.new_root); 1455 cgroup_drop_root(opts.new_root);
1314 goto out_err; 1456 goto drop_modules;
1315 } 1457 }
1316 1458
1317 root = sb->s_fs_info; 1459 root = sb->s_fs_info;
@@ -1367,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1367 free_cg_links(&tmp_cg_links); 1509 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super; 1510 goto drop_new_super;
1369 } 1511 }
1512 /*
1513 * There must be no failure case after here, since rebinding
1514 * takes care of subsystems' refcounts, which are explicitly
1515 * dropped in the failure exit path.
1516 */
1370 1517
1371 /* EBUSY should be the only error here */ 1518 /* EBUSY should be the only error here */
1372 BUG_ON(ret); 1519 BUG_ON(ret);
@@ -1405,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1405 * any) is not needed 1552 * any) is not needed
1406 */ 1553 */
1407 cgroup_drop_root(opts.new_root); 1554 cgroup_drop_root(opts.new_root);
1555 /* no subsys rebinding, so refcounts don't change */
1556 drop_parsed_module_refcounts(opts.subsys_bits);
1408 } 1557 }
1409 1558
1410 simple_set_mnt(mnt, sb); 1559 simple_set_mnt(mnt, sb);
@@ -1414,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1414 1563
1415 drop_new_super: 1564 drop_new_super:
1416 deactivate_locked_super(sb); 1565 deactivate_locked_super(sb);
1566 drop_modules:
1567 drop_parsed_module_refcounts(opts.subsys_bits);
1417 out_err: 1568 out_err:
1418 kfree(opts.release_agent); 1569 kfree(opts.release_agent);
1419 kfree(opts.name); 1570 kfree(opts.name);
@@ -1495,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1495int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1496{ 1647{
1497 char *start; 1648 char *start;
1498 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1499 1652
1500 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1501 /* 1654 /*
@@ -1511,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1511 *--start = '\0'; 1664 *--start = '\0';
1512 for (;;) { 1665 for (;;) {
1513 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1514 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1515 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1516 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1517 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1518 if (!cgrp) 1672 if (!cgrp)
1519 break; 1673 break;
1520 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1521 if (!cgrp->parent) 1678 if (!cgrp->parent)
1522 continue; 1679 continue;
1523 if (--start < buf) 1680 if (--start < buf)
@@ -1527,6 +1684,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1527 memmove(buf, start, buf + buflen - start); 1684 memmove(buf, start, buf + buflen - start);
1528 return 0; 1685 return 0;
1529} 1686}
1687EXPORT_SYMBOL_GPL(cgroup_path);
1530 1688
1531/** 1689/**
1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1690 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1539,7 +1697,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1539int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1697int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1540{ 1698{
1541 int retval = 0; 1699 int retval = 0;
1542 struct cgroup_subsys *ss; 1700 struct cgroup_subsys *ss, *failed_ss = NULL;
1543 struct cgroup *oldcgrp; 1701 struct cgroup *oldcgrp;
1544 struct css_set *cg; 1702 struct css_set *cg;
1545 struct css_set *newcg; 1703 struct css_set *newcg;
@@ -1553,8 +1711,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1553 for_each_subsys(root, ss) { 1711 for_each_subsys(root, ss) {
1554 if (ss->can_attach) { 1712 if (ss->can_attach) {
1555 retval = ss->can_attach(ss, cgrp, tsk, false); 1713 retval = ss->can_attach(ss, cgrp, tsk, false);
1556 if (retval) 1714 if (retval) {
1557 return retval; 1715 /*
1716 * Remember on which subsystem the can_attach()
1717 * failed, so that we only call cancel_attach()
1718 * against the subsystems whose can_attach()
1719 * succeeded. (See below)
1720 */
1721 failed_ss = ss;
1722 goto out;
1723 }
1558 } 1724 }
1559 } 1725 }
1560 1726
@@ -1568,14 +1734,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1568 */ 1734 */
1569 newcg = find_css_set(cg, cgrp); 1735 newcg = find_css_set(cg, cgrp);
1570 put_css_set(cg); 1736 put_css_set(cg);
1571 if (!newcg) 1737 if (!newcg) {
1572 return -ENOMEM; 1738 retval = -ENOMEM;
1739 goto out;
1740 }
1573 1741
1574 task_lock(tsk); 1742 task_lock(tsk);
1575 if (tsk->flags & PF_EXITING) { 1743 if (tsk->flags & PF_EXITING) {
1576 task_unlock(tsk); 1744 task_unlock(tsk);
1577 put_css_set(newcg); 1745 put_css_set(newcg);
1578 return -ESRCH; 1746 retval = -ESRCH;
1747 goto out;
1579 } 1748 }
1580 rcu_assign_pointer(tsk->cgroups, newcg); 1749 rcu_assign_pointer(tsk->cgroups, newcg);
1581 task_unlock(tsk); 1750 task_unlock(tsk);
@@ -1601,7 +1770,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1601 * is no longer empty. 1770 * is no longer empty.
1602 */ 1771 */
1603 cgroup_wakeup_rmdir_waiter(cgrp); 1772 cgroup_wakeup_rmdir_waiter(cgrp);
1604 return 0; 1773out:
1774 if (retval) {
1775 for_each_subsys(root, ss) {
1776 if (ss == failed_ss)
1777 /*
1778 * This subsystem was the one that failed the
1779 * can_attach() check earlier, so we don't need
1780 * to call cancel_attach() against it or any
1781 * remaining subsystems.
1782 */
1783 break;
1784 if (ss->cancel_attach)
1785 ss->cancel_attach(ss, cgrp, tsk, false);
1786 }
1787 }
1788 return retval;
1605} 1789}
1606 1790
1607/* 1791/*
@@ -1667,6 +1851,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
1667 } 1851 }
1668 return true; 1852 return true;
1669} 1853}
1854EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
1670 1855
1671static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 1856static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1672 const char *buffer) 1857 const char *buffer)
@@ -1935,6 +2120,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
1935 .rename = cgroup_rename, 2120 .rename = cgroup_rename,
1936}; 2121};
1937 2122
2123/*
2124 * Check if a file is a control file
2125 */
2126static inline struct cftype *__file_cft(struct file *file)
2127{
2128 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
2129 return ERR_PTR(-EINVAL);
2130 return __d_cft(file->f_dentry);
2131}
2132
1938static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2133static int cgroup_create_file(struct dentry *dentry, mode_t mode,
1939 struct super_block *sb) 2134 struct super_block *sb)
1940{ 2135{
@@ -2054,6 +2249,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2054 error = PTR_ERR(dentry); 2249 error = PTR_ERR(dentry);
2055 return error; 2250 return error;
2056} 2251}
2252EXPORT_SYMBOL_GPL(cgroup_add_file);
2057 2253
2058int cgroup_add_files(struct cgroup *cgrp, 2254int cgroup_add_files(struct cgroup *cgrp,
2059 struct cgroup_subsys *subsys, 2255 struct cgroup_subsys *subsys,
@@ -2068,6 +2264,7 @@ int cgroup_add_files(struct cgroup *cgrp,
2068 } 2264 }
2069 return 0; 2265 return 0;
2070} 2266}
2267EXPORT_SYMBOL_GPL(cgroup_add_files);
2071 2268
2072/** 2269/**
2073 * cgroup_task_count - count the number of tasks in a cgroup. 2270 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2453,7 +2650,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2453{ 2650{
2454 struct cgroup_pidlist *l; 2651 struct cgroup_pidlist *l;
2455 /* don't need task_nsproxy() if we're looking at ourself */ 2652 /* don't need task_nsproxy() if we're looking at ourself */
2456 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); 2653 struct pid_namespace *ns = current->nsproxy->pid_ns;
2654
2457 /* 2655 /*
2458 * We can't drop the pidlist_mutex before taking the l->mutex in case 2656 * We can't drop the pidlist_mutex before taking the l->mutex in case
2459 * the last ref-holder is trying to remove l from the list at the same 2657 * the last ref-holder is trying to remove l from the list at the same
@@ -2463,8 +2661,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2463 mutex_lock(&cgrp->pidlist_mutex); 2661 mutex_lock(&cgrp->pidlist_mutex);
2464 list_for_each_entry(l, &cgrp->pidlists, links) { 2662 list_for_each_entry(l, &cgrp->pidlists, links) {
2465 if (l->key.type == type && l->key.ns == ns) { 2663 if (l->key.type == type && l->key.ns == ns) {
2466 /* found a matching list - drop the extra refcount */
2467 put_pid_ns(ns);
2468 /* make sure l doesn't vanish out from under us */ 2664 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2665 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2666 mutex_unlock(&cgrp->pidlist_mutex);
@@ -2475,13 +2671,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2475 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 2671 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2476 if (!l) { 2672 if (!l) {
2477 mutex_unlock(&cgrp->pidlist_mutex); 2673 mutex_unlock(&cgrp->pidlist_mutex);
2478 put_pid_ns(ns);
2479 return l; 2674 return l;
2480 } 2675 }
2481 init_rwsem(&l->mutex); 2676 init_rwsem(&l->mutex);
2482 down_write(&l->mutex); 2677 down_write(&l->mutex);
2483 l->key.type = type; 2678 l->key.type = type;
2484 l->key.ns = ns; 2679 l->key.ns = get_pid_ns(ns);
2485 l->use_count = 0; /* don't increment here */ 2680 l->use_count = 0; /* don't increment here */
2486 l->list = NULL; 2681 l->list = NULL;
2487 l->owner = cgrp; 2682 l->owner = cgrp;
@@ -2789,6 +2984,173 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2789} 2984}
2790 2985
2791/* 2986/*
2987 * Unregister event and free resources.
2988 *
2989 * Gets called from workqueue.
2990 */
2991static void cgroup_event_remove(struct work_struct *work)
2992{
2993 struct cgroup_event *event = container_of(work, struct cgroup_event,
2994 remove);
2995 struct cgroup *cgrp = event->cgrp;
2996
2997 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2998
2999 eventfd_ctx_put(event->eventfd);
3000 kfree(event);
3001 dput(cgrp->dentry);
3002}
3003
3004/*
3005 * Gets called on POLLHUP on eventfd when user closes it.
3006 *
3007 * Called with wqh->lock held and interrupts disabled.
3008 */
3009static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3010 int sync, void *key)
3011{
3012 struct cgroup_event *event = container_of(wait,
3013 struct cgroup_event, wait);
3014 struct cgroup *cgrp = event->cgrp;
3015 unsigned long flags = (unsigned long)key;
3016
3017 if (flags & POLLHUP) {
3018 __remove_wait_queue(event->wqh, &event->wait);
3019 spin_lock(&cgrp->event_list_lock);
3020 list_del(&event->list);
3021 spin_unlock(&cgrp->event_list_lock);
3022 /*
3023 * We are in atomic context, but cgroup_event_remove() may
3024 * sleep, so we have to call it in workqueue.
3025 */
3026 schedule_work(&event->remove);
3027 }
3028
3029 return 0;
3030}
3031
3032static void cgroup_event_ptable_queue_proc(struct file *file,
3033 wait_queue_head_t *wqh, poll_table *pt)
3034{
3035 struct cgroup_event *event = container_of(pt,
3036 struct cgroup_event, pt);
3037
3038 event->wqh = wqh;
3039 add_wait_queue(wqh, &event->wait);
3040}
3041
3042/*
3043 * Parse the input and register a new cgroup event handler.
3044 *
3045 * Input must be in the format '<event_fd> <control_fd> <args>'.
3046 * Interpretation of args is left to the control file implementation.
3047 */
3048static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3049 const char *buffer)
3050{
3051 struct cgroup_event *event = NULL;
3052 unsigned int efd, cfd;
3053 struct file *efile = NULL;
3054 struct file *cfile = NULL;
3055 char *endp;
3056 int ret;
3057
3058 efd = simple_strtoul(buffer, &endp, 10);
3059 if (*endp != ' ')
3060 return -EINVAL;
3061 buffer = endp + 1;
3062
3063 cfd = simple_strtoul(buffer, &endp, 10);
3064 if ((*endp != ' ') && (*endp != '\0'))
3065 return -EINVAL;
3066 buffer = endp + 1;
3067
3068 event = kzalloc(sizeof(*event), GFP_KERNEL);
3069 if (!event)
3070 return -ENOMEM;
3071 event->cgrp = cgrp;
3072 INIT_LIST_HEAD(&event->list);
3073 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3074 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3075 INIT_WORK(&event->remove, cgroup_event_remove);
3076
3077 efile = eventfd_fget(efd);
3078 if (IS_ERR(efile)) {
3079 ret = PTR_ERR(efile);
3080 goto fail;
3081 }
3082
3083 event->eventfd = eventfd_ctx_fileget(efile);
3084 if (IS_ERR(event->eventfd)) {
3085 ret = PTR_ERR(event->eventfd);
3086 goto fail;
3087 }
3088
3089 cfile = fget(cfd);
3090 if (!cfile) {
3091 ret = -EBADF;
3092 goto fail;
3093 }
3094
3095	/* the process needs read permission on the control file */
3096 ret = file_permission(cfile, MAY_READ);
3097 if (ret < 0)
3098 goto fail;
3099
3100 event->cft = __file_cft(cfile);
3101 if (IS_ERR(event->cft)) {
3102 ret = PTR_ERR(event->cft);
3103 goto fail;
3104 }
3105
3106 if (!event->cft->register_event || !event->cft->unregister_event) {
3107 ret = -EINVAL;
3108 goto fail;
3109 }
3110
3111 ret = event->cft->register_event(cgrp, event->cft,
3112 event->eventfd, buffer);
3113 if (ret)
3114 goto fail;
3115
3116 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
3117 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3118 ret = 0;
3119 goto fail;
3120 }
3121
3122 /*
3123 * Events should be removed after rmdir of cgroup directory, but before
3124 * destroying subsystem state objects. Let's take reference to cgroup
3125 * directory dentry to do that.
3126 */
3127 dget(cgrp->dentry);
3128
3129 spin_lock(&cgrp->event_list_lock);
3130 list_add(&event->list, &cgrp->event_list);
3131 spin_unlock(&cgrp->event_list_lock);
3132
3133 fput(cfile);
3134 fput(efile);
3135
3136 return 0;
3137
3138fail:
3139 if (cfile)
3140 fput(cfile);
3141
3142 if (event && event->eventfd && !IS_ERR(event->eventfd))
3143 eventfd_ctx_put(event->eventfd);
3144
3145 if (!IS_ERR_OR_NULL(efile))
3146 fput(efile);
3147
3148 kfree(event);
3149
3150 return ret;
3151}
3152
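A note on usage: the interface added above is driven entirely from userspace by writing "<event_fd> <control_fd> <args>" to cgroup.event_control and then blocking on the eventfd. Below is a minimal userspace sketch; it is not part of this patch, and the mount point, the control file (a memcg usage threshold) and the threshold value are illustrative assumptions.

/* userspace sketch: register an eventfd notification on a control file
 * via cgroup.event_control, then wait for it to fire. rmdir of the cgroup
 * also signals the eventfd, as handled in cgroup_rmdir() further down. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/cgroups/memory/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ecfd = open("/cgroups/memory/mygroup/cgroup.event_control", O_WRONLY);
	char buf[64];
	uint64_t count;

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; args here is a byte threshold */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
		 (unsigned long long)(64 << 20));
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	/* blocks until the control file's handler signals the eventfd */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("event fired, count=%llu\n", (unsigned long long)count);
	return 0;
}
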
3153/*
2792 * for the common functions, 'private' gives the type of file 3154 * for the common functions, 'private' gives the type of file
2793 */ 3155 */
2794/* for hysterical raisins, we can't put this on the older files */ 3156/* for hysterical raisins, we can't put this on the older files */
@@ -2813,6 +3175,11 @@ static struct cftype files[] = {
2813 .read_u64 = cgroup_read_notify_on_release, 3175 .read_u64 = cgroup_read_notify_on_release,
2814 .write_u64 = cgroup_write_notify_on_release, 3176 .write_u64 = cgroup_write_notify_on_release,
2815 }, 3177 },
3178 {
3179 .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
3180 .write_string = cgroup_write_event_control,
3181 .mode = S_IWUGO,
3182 },
2816}; 3183};
2817 3184
2818static struct cftype cft_release_agent = { 3185static struct cftype cft_release_agent = {
@@ -2877,8 +3244,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2877 /* We need to take each hierarchy_mutex in a consistent order */ 3244 /* We need to take each hierarchy_mutex in a consistent order */
2878 int i; 3245 int i;
2879 3246
3247 /*
3248 * No worry about a race with rebind_subsystems that might mess up the
3249 * locking order, since both parties are under cgroup_mutex.
3250 */
2880 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3251 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2881 struct cgroup_subsys *ss = subsys[i]; 3252 struct cgroup_subsys *ss = subsys[i];
3253 if (ss == NULL)
3254 continue;
2882 if (ss->root == root) 3255 if (ss->root == root)
2883 mutex_lock(&ss->hierarchy_mutex); 3256 mutex_lock(&ss->hierarchy_mutex);
2884 } 3257 }
@@ -2890,6 +3263,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2890 3263
2891 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3264 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2892 struct cgroup_subsys *ss = subsys[i]; 3265 struct cgroup_subsys *ss = subsys[i];
3266 if (ss == NULL)
3267 continue;
2893 if (ss->root == root) 3268 if (ss->root == root)
2894 mutex_unlock(&ss->hierarchy_mutex); 3269 mutex_unlock(&ss->hierarchy_mutex);
2895 } 3270 }
@@ -2936,14 +3311,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2936 3311
2937 for_each_subsys(root, ss) { 3312 for_each_subsys(root, ss) {
2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3313 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
3314
2939 if (IS_ERR(css)) { 3315 if (IS_ERR(css)) {
2940 err = PTR_ERR(css); 3316 err = PTR_ERR(css);
2941 goto err_destroy; 3317 goto err_destroy;
2942 } 3318 }
2943 init_cgroup_css(css, ss, cgrp); 3319 init_cgroup_css(css, ss, cgrp);
2944 if (ss->use_id) 3320 if (ss->use_id) {
2945 if (alloc_css_id(ss, parent, cgrp)) 3321 err = alloc_css_id(ss, parent, cgrp);
3322 if (err)
2946 goto err_destroy; 3323 goto err_destroy;
3324 }
2947 /* At error, ->destroy() callback has to free assigned ID. */ 3325 /* At error, ->destroy() callback has to free assigned ID. */
2948 } 3326 }
2949 3327
@@ -3010,11 +3388,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
3010 * synchronization other than RCU, and the subsystem linked 3388 * synchronization other than RCU, and the subsystem linked
3011 * list isn't RCU-safe */ 3389 * list isn't RCU-safe */
3012 int i; 3390 int i;
3391 /*
3392 * We won't need to lock the subsys array, because the subsystems
3393 * we're concerned about aren't going anywhere since our cgroup root
3394 * has a reference on them.
3395 */
3013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3396 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3014 struct cgroup_subsys *ss = subsys[i]; 3397 struct cgroup_subsys *ss = subsys[i];
3015 struct cgroup_subsys_state *css; 3398 struct cgroup_subsys_state *css;
3016 /* Skip subsystems not in this hierarchy */ 3399 /* Skip subsystems not present or not in this hierarchy */
3017 if (ss->root != cgrp->root) 3400 if (ss == NULL || ss->root != cgrp->root)
3018 continue; 3401 continue;
3019 css = cgrp->subsys[ss->subsys_id]; 3402 css = cgrp->subsys[ss->subsys_id];
3020 /* When called from check_for_release() it's possible 3403 /* When called from check_for_release() it's possible
@@ -3088,6 +3471,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
3088 struct dentry *d; 3471 struct dentry *d;
3089 struct cgroup *parent; 3472 struct cgroup *parent;
3090 DEFINE_WAIT(wait); 3473 DEFINE_WAIT(wait);
3474 struct cgroup_event *event, *tmp;
3091 int ret; 3475 int ret;
3092 3476
3093 /* the vfs holds both inode->i_mutex already */ 3477 /* the vfs holds both inode->i_mutex already */
@@ -3171,6 +3555,20 @@ again:
3171 set_bit(CGRP_RELEASABLE, &parent->flags); 3555 set_bit(CGRP_RELEASABLE, &parent->flags);
3172 check_for_release(parent); 3556 check_for_release(parent);
3173 3557
3558 /*
3559 * Unregister events and notify userspace.
3560	 * Notify userspace about cgroup removal only after rmdir of the cgroup
3561	 * directory, to avoid a race between userspace and kernelspace.
3562 */
3563 spin_lock(&cgrp->event_list_lock);
3564 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
3565 list_del(&event->list);
3566 remove_wait_queue(event->wqh, &event->wait);
3567 eventfd_signal(event->eventfd, 1);
3568 schedule_work(&event->remove);
3569 }
3570 spin_unlock(&cgrp->event_list_lock);
3571
3174 mutex_unlock(&cgroup_mutex); 3572 mutex_unlock(&cgroup_mutex);
3175 return 0; 3573 return 0;
3176} 3574}
@@ -3205,9 +3603,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3205 mutex_init(&ss->hierarchy_mutex); 3603 mutex_init(&ss->hierarchy_mutex);
3206 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); 3604 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3207 ss->active = 1; 3605 ss->active = 1;
3606
3607 /* this function shouldn't be used with modular subsystems, since they
3608 * need to register a subsys_id, among other things */
3609 BUG_ON(ss->module);
3208} 3610}
3209 3611
3210/** 3612/**
3613 * cgroup_load_subsys: load and register a modular subsystem at runtime
3614 * @ss: the subsystem to load
3615 *
3616 * This function should be called in a modular subsystem's initcall. If the
3617 * subsystem is built as a module, it will be assigned a new subsys_id and set
3618 * up for use. If the subsystem is built-in anyway, work is delegated to the
3619 * simpler cgroup_init_subsys.
3620 */
3621int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
3622{
3623 int i;
3624 struct cgroup_subsys_state *css;
3625
3626 /* check name and function validity */
3627 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
3628 ss->create == NULL || ss->destroy == NULL)
3629 return -EINVAL;
3630
3631 /*
3632 * we don't support callbacks in modular subsystems. this check is
3633 * before the ss->module check for consistency; a subsystem that could
3634 * be a module should still have no callbacks even if the user isn't
3635 * compiling it as one.
3636 */
3637 if (ss->fork || ss->exit)
3638 return -EINVAL;
3639
3640 /*
3641 * an optionally modular subsystem is built-in: we want to do nothing,
3642 * since cgroup_init_subsys will have already taken care of it.
3643 */
3644 if (ss->module == NULL) {
3645 /* a few sanity checks */
3646 BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
3647 BUG_ON(subsys[ss->subsys_id] != ss);
3648 return 0;
3649 }
3650
3651 /*
3652 * need to register a subsys id before anything else - for example,
3653 * init_cgroup_css needs it.
3654 */
3655 mutex_lock(&cgroup_mutex);
3656 /* find the first empty slot in the array */
3657 for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
3658 if (subsys[i] == NULL)
3659 break;
3660 }
3661 if (i == CGROUP_SUBSYS_COUNT) {
3662 /* maximum number of subsystems already registered! */
3663 mutex_unlock(&cgroup_mutex);
3664 return -EBUSY;
3665 }
3666 /* assign ourselves the subsys_id */
3667 ss->subsys_id = i;
3668 subsys[i] = ss;
3669
3670 /*
3671 * no ss->create seems to need anything important in the ss struct, so
3672 * this can happen first (i.e. before the rootnode attachment).
3673 */
3674 css = ss->create(ss, dummytop);
3675 if (IS_ERR(css)) {
3676 /* failure case - need to deassign the subsys[] slot. */
3677 subsys[i] = NULL;
3678 mutex_unlock(&cgroup_mutex);
3679 return PTR_ERR(css);
3680 }
3681
3682 list_add(&ss->sibling, &rootnode.subsys_list);
3683 ss->root = &rootnode;
3684
3685 /* our new subsystem will be attached to the dummy hierarchy. */
3686 init_cgroup_css(css, ss, dummytop);
3687 /* init_idr must be after init_cgroup_css because it sets css->id. */
3688 if (ss->use_id) {
3689 int ret = cgroup_init_idr(ss, css);
3690 if (ret) {
3691 dummytop->subsys[ss->subsys_id] = NULL;
3692 ss->destroy(ss, dummytop);
3693 subsys[i] = NULL;
3694 mutex_unlock(&cgroup_mutex);
3695 return ret;
3696 }
3697 }
3698
3699 /*
3700 * Now we need to entangle the css into the existing css_sets. unlike
3701 * in cgroup_init_subsys, there are now multiple css_sets, so each one
3702 * will need a new pointer to it; done by iterating the css_set_table.
3703 * furthermore, modifying the existing css_sets will corrupt the hash
3704 * table state, so each changed css_set will need its hash recomputed.
3705 * this is all done under the css_set_lock.
3706 */
3707 write_lock(&css_set_lock);
3708 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
3709 struct css_set *cg;
3710 struct hlist_node *node, *tmp;
3711 struct hlist_head *bucket = &css_set_table[i], *new_bucket;
3712
3713 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
3714 /* skip entries that we already rehashed */
3715 if (cg->subsys[ss->subsys_id])
3716 continue;
3717 /* remove existing entry */
3718 hlist_del(&cg->hlist);
3719 /* set new value */
3720 cg->subsys[ss->subsys_id] = css;
3721 /* recompute hash and restore entry */
3722 new_bucket = css_set_hash(cg->subsys);
3723 hlist_add_head(&cg->hlist, new_bucket);
3724 }
3725 }
3726 write_unlock(&css_set_lock);
3727
3728 mutex_init(&ss->hierarchy_mutex);
3729 lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
3730 ss->active = 1;
3731
3732 /* success! */
3733 mutex_unlock(&cgroup_mutex);
3734 return 0;
3735}
3736EXPORT_SYMBOL_GPL(cgroup_load_subsys);
3737
3738/**
3739 * cgroup_unload_subsys: unload a modular subsystem
3740 * @ss: the subsystem to unload
3741 *
3742 * This function should be called in a modular subsystem's exitcall. When this
3743 * function is invoked, the refcount on the subsystem's module will be 0, so
3744 * the subsystem will not be attached to any hierarchy.
3745 */
3746void cgroup_unload_subsys(struct cgroup_subsys *ss)
3747{
3748 struct cg_cgroup_link *link;
3749 struct hlist_head *hhead;
3750
3751 BUG_ON(ss->module == NULL);
3752
3753 /*
3754 * we shouldn't be called if the subsystem is in use, and the use of
3755 * try_module_get in parse_cgroupfs_options should ensure that it
3756 * doesn't start being used while we're killing it off.
3757 */
3758 BUG_ON(ss->root != &rootnode);
3759
3760 mutex_lock(&cgroup_mutex);
3761 /* deassign the subsys_id */
3762 BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
3763 subsys[ss->subsys_id] = NULL;
3764
3765 /* remove subsystem from rootnode's list of subsystems */
3766 list_del(&ss->sibling);
3767
3768 /*
3769 * disentangle the css from all css_sets attached to the dummytop. as
3770 * in loading, we need to pay our respects to the hashtable gods.
3771 */
3772 write_lock(&css_set_lock);
3773 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
3774 struct css_set *cg = link->cg;
3775
3776 hlist_del(&cg->hlist);
3777 BUG_ON(!cg->subsys[ss->subsys_id]);
3778 cg->subsys[ss->subsys_id] = NULL;
3779 hhead = css_set_hash(cg->subsys);
3780 hlist_add_head(&cg->hlist, hhead);
3781 }
3782 write_unlock(&css_set_lock);
3783
3784 /*
3785 * remove subsystem's css from the dummytop and free it - need to free
3786 * before marking as null because ss->destroy needs the cgrp->subsys
3787 * pointer to find their state. note that this also takes care of
3788 * freeing the css_id.
3789 */
3790 ss->destroy(ss, dummytop);
3791 dummytop->subsys[ss->subsys_id] = NULL;
3792
3793 mutex_unlock(&cgroup_mutex);
3794}
3795EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
3796
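For orientation, a modular controller is expected to pair these two entry points in its module init/exit path. The sketch below is not taken from this patch; the controller name and callbacks are hypothetical, and it only exercises the fields that cgroup_load_subsys() actually checks (name, create, destroy, module; fork/exit callbacks are rejected for modules).

/* sketch of a minimal modular subsystem; all names are hypothetical */
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
					      struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	return css ? css : ERR_PTR(-ENOMEM);
}

static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	kfree(cgrp->subsys[ss->subsys_id]);
}

struct cgroup_subsys foo_subsys = {
	.name		= "foo",
	.create		= foo_create,
	.destroy	= foo_destroy,
	.module		= THIS_MODULE,	/* subsys_id is assigned at load time */
};

static int __init foo_module_init(void)
{
	return cgroup_load_subsys(&foo_subsys);
}
module_init(foo_module_init);

static void __exit foo_module_exit(void)
{
	cgroup_unload_subsys(&foo_subsys);
}
module_exit(foo_module_exit);

MODULE_LICENSE("GPL");
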
3797/**
3211 * cgroup_init_early - cgroup initialization at system boot 3798 * cgroup_init_early - cgroup initialization at system boot
3212 * 3799 *
3213 * Initialize cgroups at system boot, and initialize any 3800 * Initialize cgroups at system boot, and initialize any
@@ -3235,7 +3822,8 @@ int __init cgroup_init_early(void)
3235 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) 3822 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
3236 INIT_HLIST_HEAD(&css_set_table[i]); 3823 INIT_HLIST_HEAD(&css_set_table[i]);
3237 3824
3238 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3825 /* at bootup time, we don't worry about modular subsystems */
3826 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3239 struct cgroup_subsys *ss = subsys[i]; 3827 struct cgroup_subsys *ss = subsys[i];
3240 3828
3241 BUG_ON(!ss->name); 3829 BUG_ON(!ss->name);
@@ -3270,12 +3858,13 @@ int __init cgroup_init(void)
3270 if (err) 3858 if (err)
3271 return err; 3859 return err;
3272 3860
3273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3861 /* at bootup time, we don't worry about modular subsystems */
3862 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3274 struct cgroup_subsys *ss = subsys[i]; 3863 struct cgroup_subsys *ss = subsys[i];
3275 if (!ss->early_init) 3864 if (!ss->early_init)
3276 cgroup_init_subsys(ss); 3865 cgroup_init_subsys(ss);
3277 if (ss->use_id) 3866 if (ss->use_id)
3278 cgroup_subsys_init_idr(ss); 3867 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
3279 } 3868 }
3280 3869
3281 /* Add init_css_set to the hash table */ 3870 /* Add init_css_set to the hash table */
@@ -3379,9 +3968,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3379 int i; 3968 int i;
3380 3969
3381 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 3970 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
3971 /*
3972 * ideally we don't want subsystems moving around while we do this.
3973 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
3974 * subsys/hierarchy state.
3975 */
3382 mutex_lock(&cgroup_mutex); 3976 mutex_lock(&cgroup_mutex);
3383 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3977 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3384 struct cgroup_subsys *ss = subsys[i]; 3978 struct cgroup_subsys *ss = subsys[i];
3979 if (ss == NULL)
3980 continue;
3385 seq_printf(m, "%s\t%d\t%d\t%d\n", 3981 seq_printf(m, "%s\t%d\t%d\t%d\n",
3386 ss->name, ss->root->hierarchy_id, 3982 ss->name, ss->root->hierarchy_id,
3387 ss->root->number_of_cgroups, !ss->disabled); 3983 ss->root->number_of_cgroups, !ss->disabled);
@@ -3439,7 +4035,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
3439{ 4035{
3440 if (need_forkexit_callback) { 4036 if (need_forkexit_callback) {
3441 int i; 4037 int i;
3442 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4038 /*
4039 * forkexit callbacks are only supported for builtin
4040 * subsystems, and the builtin section of the subsys array is
4041 * immutable, so we don't need to lock the subsys array here.
4042 */
4043 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3443 struct cgroup_subsys *ss = subsys[i]; 4044 struct cgroup_subsys *ss = subsys[i];
3444 if (ss->fork) 4045 if (ss->fork)
3445 ss->fork(ss, child); 4046 ss->fork(ss, child);
@@ -3508,7 +4109,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
3508 struct css_set *cg; 4109 struct css_set *cg;
3509 4110
3510 if (run_callbacks && need_forkexit_callback) { 4111 if (run_callbacks && need_forkexit_callback) {
3511 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4112 /*
4113 * modular subsystems can't use callbacks, so no need to lock
4114 * the subsys array
4115 */
4116 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3512 struct cgroup_subsys *ss = subsys[i]; 4117 struct cgroup_subsys *ss = subsys[i];
3513 if (ss->exit) 4118 if (ss->exit)
3514 ss->exit(ss, tsk); 4119 ss->exit(ss, tsk);
@@ -3702,12 +4307,13 @@ static void check_for_release(struct cgroup *cgrp)
3702 } 4307 }
3703} 4308}
3704 4309
3705void __css_put(struct cgroup_subsys_state *css) 4310/* Caller must verify that the css is not for root cgroup */
4311void __css_put(struct cgroup_subsys_state *css, int count)
3706{ 4312{
3707 struct cgroup *cgrp = css->cgroup; 4313 struct cgroup *cgrp = css->cgroup;
3708 int val; 4314 int val;
3709 rcu_read_lock(); 4315 rcu_read_lock();
3710 val = atomic_dec_return(&css->refcnt); 4316 val = atomic_sub_return(count, &css->refcnt);
3711 if (val == 1) { 4317 if (val == 1) {
3712 if (notify_on_release(cgrp)) { 4318 if (notify_on_release(cgrp)) {
3713 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4319 set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3718,6 +4324,7 @@ void __css_put(struct cgroup_subsys_state *css)
3718 rcu_read_unlock(); 4324 rcu_read_unlock();
3719 WARN_ON_ONCE(val < 1); 4325 WARN_ON_ONCE(val < 1);
3720} 4326}
4327EXPORT_SYMBOL_GPL(__css_put);
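
The count parameter lets batched users drop several references at once; single-reference callers are expected to go through an inline wrapper in the cgroup header (not shown in this diff) that also enforces the "not the root cgroup" rule noted above. Roughly, and only as a sketch with an assumed name:

/* sketch of the caller-side pairing; the real helper lives in the cgroup
 * header. The root cgroup's css is never put, per the comment above. */
static inline void css_put_one(struct cgroup_subsys_state *css)
{
	if (css->cgroup->parent)	/* skip the root cgroup's css */
		__css_put(css, 1);
}
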
3721 4328
3722/* 4329/*
3723 * Notify userspace when a cgroup is released, by running the 4330 * Notify userspace when a cgroup is released, by running the
@@ -3799,8 +4406,11 @@ static int __init cgroup_disable(char *str)
3799 while ((token = strsep(&str, ",")) != NULL) { 4406 while ((token = strsep(&str, ",")) != NULL) {
3800 if (!*token) 4407 if (!*token)
3801 continue; 4408 continue;
3802 4409 /*
3803 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4410 * cgroup_disable, being at boot time, can't know about module
4411 * subsystems, so we don't worry about them.
4412 */
4413 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
3804 struct cgroup_subsys *ss = subsys[i]; 4414 struct cgroup_subsys *ss = subsys[i];
3805 4415
3806 if (!strcmp(token, ss->name)) { 4416 if (!strcmp(token, ss->name)) {
@@ -3824,31 +4434,65 @@ __setup("cgroup_disable=", cgroup_disable);
3824 */ 4434 */
3825unsigned short css_id(struct cgroup_subsys_state *css) 4435unsigned short css_id(struct cgroup_subsys_state *css)
3826{ 4436{
3827 struct css_id *cssid = rcu_dereference(css->id); 4437 struct css_id *cssid;
4438
4439 /*
4440	 * This css_id() can return the correct value when someone holds a refcnt
4441	 * on this css, or under rcu_read_lock(). Once css->id is allocated,
4442 * it's unchanged until freed.
4443 */
4444 cssid = rcu_dereference_check(css->id,
4445 rcu_read_lock_held() || atomic_read(&css->refcnt));
3828 4446
3829 if (cssid) 4447 if (cssid)
3830 return cssid->id; 4448 return cssid->id;
3831 return 0; 4449 return 0;
3832} 4450}
4451EXPORT_SYMBOL_GPL(css_id);
3833 4452
3834unsigned short css_depth(struct cgroup_subsys_state *css) 4453unsigned short css_depth(struct cgroup_subsys_state *css)
3835{ 4454{
3836 struct css_id *cssid = rcu_dereference(css->id); 4455 struct css_id *cssid;
4456
4457 cssid = rcu_dereference_check(css->id,
4458 rcu_read_lock_held() || atomic_read(&css->refcnt));
3837 4459
3838 if (cssid) 4460 if (cssid)
3839 return cssid->depth; 4461 return cssid->depth;
3840 return 0; 4462 return 0;
3841} 4463}
4464EXPORT_SYMBOL_GPL(css_depth);
4465
4466/**
4467 * css_is_ancestor - test "root" css is an ancestor of "child"
4468 * @child: the css to be tested.
4469 * @root: the css supposed to be an ancestor of the child.
4470 *
4471 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4472 * this function reads css->id, it uses rcu_dereference() and rcu_read_lock().
4473 * But, considering usual usage, the csses should be valid objects after the test.
4474 * Assuming that the caller will do some action on the child if this returns
4475 * true, the caller must hold a reference count on "child".
4476 * If "child" is a valid object and this returns true, "root" is valid, too.
4477 */
3842 4478
3843bool css_is_ancestor(struct cgroup_subsys_state *child, 4479bool css_is_ancestor(struct cgroup_subsys_state *child,
3844 const struct cgroup_subsys_state *root) 4480 const struct cgroup_subsys_state *root)
3845{ 4481{
3846 struct css_id *child_id = rcu_dereference(child->id); 4482 struct css_id *child_id;
3847 struct css_id *root_id = rcu_dereference(root->id); 4483 struct css_id *root_id;
4484 bool ret = true;
3848 4485
3849 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4486 rcu_read_lock();
3850 return false; 4487 child_id = rcu_dereference(child->id);
3851 return child_id->stack[root_id->depth] == root_id->id; 4488 root_id = rcu_dereference(root->id);
4489 if (!child_id
4490 || !root_id
4491 || (child_id->depth < root_id->depth)
4492 || (child_id->stack[root_id->depth] != root_id->id))
4493 ret = false;
4494 rcu_read_unlock();
4495 return ret;
3852} 4496}
3853 4497
3854static void __free_css_id_cb(struct rcu_head *head) 4498static void __free_css_id_cb(struct rcu_head *head)
@@ -3875,6 +4519,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
3875 spin_unlock(&ss->id_lock); 4519 spin_unlock(&ss->id_lock);
3876 call_rcu(&id->rcu_head, __free_css_id_cb); 4520 call_rcu(&id->rcu_head, __free_css_id_cb);
3877} 4521}
4522EXPORT_SYMBOL_GPL(free_css_id);
3878 4523
3879/* 4524/*
3880 * This is called by init or create(). Then, calls to this function are 4525 * This is called by init or create(). Then, calls to this function are
@@ -3924,15 +4569,14 @@ err_out:
3924 4569
3925} 4570}
3926 4571
3927static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) 4572static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4573 struct cgroup_subsys_state *rootcss)
3928{ 4574{
3929 struct css_id *newid; 4575 struct css_id *newid;
3930 struct cgroup_subsys_state *rootcss;
3931 4576
3932 spin_lock_init(&ss->id_lock); 4577 spin_lock_init(&ss->id_lock);
3933 idr_init(&ss->idr); 4578 idr_init(&ss->idr);
3934 4579
3935 rootcss = init_css_set.subsys[ss->subsys_id];
3936 newid = get_new_cssid(ss, 0); 4580 newid = get_new_cssid(ss, 0);
3937 if (IS_ERR(newid)) 4581 if (IS_ERR(newid))
3938 return PTR_ERR(newid); 4582 return PTR_ERR(newid);
@@ -3948,13 +4592,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3948{ 4592{
3949 int subsys_id, i, depth = 0; 4593 int subsys_id, i, depth = 0;
3950 struct cgroup_subsys_state *parent_css, *child_css; 4594 struct cgroup_subsys_state *parent_css, *child_css;
3951 struct css_id *child_id, *parent_id = NULL; 4595 struct css_id *child_id, *parent_id;
3952 4596
3953 subsys_id = ss->subsys_id; 4597 subsys_id = ss->subsys_id;
3954 parent_css = parent->subsys[subsys_id]; 4598 parent_css = parent->subsys[subsys_id];
3955 child_css = child->subsys[subsys_id]; 4599 child_css = child->subsys[subsys_id];
3956 depth = css_depth(parent_css) + 1;
3957 parent_id = parent_css->id; 4600 parent_id = parent_css->id;
4601 depth = parent_id->depth + 1;
3958 4602
3959 child_id = get_new_cssid(ss, depth); 4603 child_id = get_new_cssid(ss, depth);
3960 if (IS_ERR(child_id)) 4604 if (IS_ERR(child_id))
@@ -3992,6 +4636,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3992 4636
3993 return rcu_dereference(cssid->css); 4637 return rcu_dereference(cssid->css);
3994} 4638}
4639EXPORT_SYMBOL_GPL(css_lookup);
3995 4640
3996/** 4641/**
3997 * css_get_next - lookup next cgroup under specified hierarchy. 4642 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -85,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
85 89
86/* Locks taken and their ordering 90/* Locks taken and their ordering
87 * ------------------------------ 91 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
92 * task->sighand->siglock 96 * task->sighand->siglock
93 * 97 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -96,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
96 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
98 * 102 *
99 * can_attach(): 103 * freezer_can_attach():
100 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
101 * 105 *
102 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
103 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
104 * 108 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * task->alloc_lock (to get task's cgroup)
107 * freezer->lock 110 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
109 * 112 *
110 * freezer_read(): 113 * freezer_read():
111 * cgroup_mutex 114 * cgroup_mutex
112 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
113 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
114 * 119 *
115 * freezer_write() (freeze): 120 * freezer_write() (freeze):
116 * cgroup_mutex 121 * cgroup_mutex
117 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
118 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
120 * 127 *
121 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
122 * cgroup_mutex 129 * cgroup_mutex
123 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
124 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
126 * sighand->siglock 135 * sighand->siglock
127 */ 136 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -201,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 210 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 211 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 212 * freezer won't be removed and will be valid during this
204 * function call. 213 * function call. Nevertheless, apply RCU read-side critical
214 * section to suppress RCU lockdep false positives.
205 */ 215 */
216 rcu_read_lock();
206 freezer = task_freezer(task); 217 freezer = task_freezer(task);
218 rcu_read_unlock();
207 219
208 /* 220 /*
209 * The root cgroup is non-freezable, so we can skip the 221 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
@@ -494,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
494{ 495{
495 int ret; 496 int ret;
496 cpumask_var_t mask; 497 cpumask_var_t mask;
497 unsigned long *k;
498 unsigned int min_length = cpumask_size();
499
500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
501 min_length = sizeof(compat_ulong_t);
502 498
503 if (len < min_length) 499 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
500 return -EINVAL;
501 if (len & (sizeof(compat_ulong_t)-1))
504 return -EINVAL; 502 return -EINVAL;
505 503
506 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 504 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM; 505 return -ENOMEM;
508 506
509 ret = sched_getaffinity(pid, mask); 507 ret = sched_getaffinity(pid, mask);
510 if (ret < 0) 508 if (ret == 0) {
511 goto out; 509 size_t retlen = min_t(size_t, len, cpumask_size());
512 510
513 k = cpumask_bits(mask); 511 if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 512 ret = -EFAULT;
515 if (ret == 0) 513 else
516 ret = min_length; 514 ret = retlen;
517 515 }
518out:
519 free_cpumask_var(mask); 516 free_cpumask_var(mask);
517
520 return ret; 518 return ret;
521} 519}
522 520
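With this change the compat path behaves like the native syscall: the user buffer must cover nr_cpu_ids bits, its length must be a multiple of sizeof(compat_ulong_t), and on success the number of bytes copied is returned (glibc hides that behind a 0/-1 return). A userspace sketch, not part of this patch, using the standard glibc CPU_ALLOC helpers to size the mask from the configured CPU count:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long ncpus = sysconf(_SC_NPROCESSORS_CONF);
	cpu_set_t *set = CPU_ALLOC(ncpus);
	size_t size = CPU_ALLOC_SIZE(ncpus);	/* large enough, word-aligned */

	if (set && sched_getaffinity(0, size, set) == 0)
		printf("runnable on %d of %ld cpus\n",
		       CPU_COUNT_S(size, set), ncpus);
	CPU_FREE(set);
	return 0;
}
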
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..97d1b426a4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,18 +14,35 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20static DEFINE_MUTEX(cpu_add_remove_lock); 21static DEFINE_MUTEX(cpu_add_remove_lock);
21 22
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 23/*
24 * The following two API's must be used when attempting
25 * to serialize the updates to cpu_online_mask, cpu_present_mask.
26 */
27void cpu_maps_update_begin(void)
28{
29 mutex_lock(&cpu_add_remove_lock);
30}
31
32void cpu_maps_update_done(void)
33{
34 mutex_unlock(&cpu_add_remove_lock);
35}
36
37static RAW_NOTIFIER_HEAD(cpu_chain);
23 38
24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
25 * Should always be manipulated under cpu_add_remove_lock 40 * Should always be manipulated under cpu_add_remove_lock
26 */ 41 */
27static int cpu_hotplug_disabled; 42static int cpu_hotplug_disabled;
28 43
44#ifdef CONFIG_HOTPLUG_CPU
45
29static struct { 46static struct {
30 struct task_struct *active_writer; 47 struct task_struct *active_writer;
31 struct mutex lock; /* Synchronizes accesses to refcount, */ 48 struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -40,8 +57,6 @@ static struct {
40 .refcount = 0, 57 .refcount = 0,
41}; 58};
42 59
43#ifdef CONFIG_HOTPLUG_CPU
44
45void get_online_cpus(void) 60void get_online_cpus(void)
46{ 61{
47 might_sleep(); 62 might_sleep();
@@ -66,22 +81,6 @@ void put_online_cpus(void)
66} 81}
67EXPORT_SYMBOL_GPL(put_online_cpus); 82EXPORT_SYMBOL_GPL(put_online_cpus);
68 83
69#endif /* CONFIG_HOTPLUG_CPU */
70
71/*
72 * The following two API's must be used when attempting
73 * to serialize the updates to cpu_online_mask, cpu_present_mask.
74 */
75void cpu_maps_update_begin(void)
76{
77 mutex_lock(&cpu_add_remove_lock);
78}
79
80void cpu_maps_update_done(void)
81{
82 mutex_unlock(&cpu_add_remove_lock);
83}
84
85/* 84/*
86 * This ensures that the hotplug operation can begin only when the 85 * This ensures that the hotplug operation can begin only when the
87 * refcount goes to zero. 86 * refcount goes to zero.
@@ -123,6 +122,12 @@ static void cpu_hotplug_done(void)
123 cpu_hotplug.active_writer = NULL; 122 cpu_hotplug.active_writer = NULL;
124 mutex_unlock(&cpu_hotplug.lock); 123 mutex_unlock(&cpu_hotplug.lock);
125} 124}
125
126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {}
129#endif /* #else #if CONFIG_HOTPLUG_CPU */
130
126/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
127int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
128{ 133{
@@ -133,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
133 return ret; 138 return ret;
134} 139}
135 140
141static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
142 int *nr_calls)
143{
144 int ret;
145
146 ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
147 nr_calls);
148
149 return notifier_to_errno(ret);
150}
151
152static int cpu_notify(unsigned long val, void *v)
153{
154 return __cpu_notify(val, v, -1, NULL);
155}
156
136#ifdef CONFIG_HOTPLUG_CPU 157#ifdef CONFIG_HOTPLUG_CPU
137 158
159static void cpu_notify_nofail(unsigned long val, void *v)
160{
161 BUG_ON(cpu_notify(val, v));
162}
163
138EXPORT_SYMBOL(register_cpu_notifier); 164EXPORT_SYMBOL(register_cpu_notifier);
139 165
140void __ref unregister_cpu_notifier(struct notifier_block *nb) 166void __ref unregister_cpu_notifier(struct notifier_block *nb)
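
cpu_notify(), __cpu_notify() and cpu_notify_nofail() are file-local wrappers around the notifier chain; external code still registers callbacks with register_cpu_notifier(). As a reminder of what such a callback looks like (a generic sketch, not code from this patch; the names are hypothetical):

/* sketch of a typical CPU hotplug notifier registered via
 * register_cpu_notifier(); callback and variable names are made up */
#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu state for @cpu; may fail */
		break;
	case CPU_ONLINE:
		/* cpu is running; start using it */
		break;
	case CPU_DOWN_PREPARE:
		/* cpu is about to go away; may veto by returning NOTIFY_BAD */
		break;
	case CPU_DEAD:
		/* cpu is gone; free per-cpu state */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb __cpuinitdata = {
	.notifier_call = my_cpu_callback,
};

/* somewhere in init code: register_cpu_notifier(&my_cpu_nb); */
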
@@ -151,18 +177,19 @@ static inline void check_for_tasks(int cpu)
151 177
152 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 179 for_each_process(p) {
154 if (task_cpu(p) == cpu && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 181 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 182 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 184 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 185 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 186 p->state, p->flags);
161 } 187 }
162 write_unlock_irq(&tasklist_lock); 188 write_unlock_irq(&tasklist_lock);
163} 189}
164 190
165struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
166 unsigned long mod; 193 unsigned long mod;
167 void *hcpu; 194 void *hcpu;
168}; 195};
@@ -171,6 +198,7 @@ struct take_cpu_down_param {
171static int __ref take_cpu_down(void *_param) 198static int __ref take_cpu_down(void *_param)
172{ 199{
173 struct take_cpu_down_param *param = _param; 200 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
174 int err; 202 int err;
175 203
176 /* Ensure this CPU doesn't handle any more interrupts. */ 204 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -178,9 +206,10 @@ static int __ref take_cpu_down(void *_param)
178 if (err < 0) 206 if (err < 0)
179 return err; 207 return err;
180 208
181 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 209 cpu_notify(CPU_DYING | param->mod, param->hcpu);
182 param->hcpu);
183 210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
184 /* Force idle task to run as soon as we yield: it should 213 /* Force idle task to run as soon as we yield: it should
185 immediately notice cpu is offline and die quickly. */ 214 immediately notice cpu is offline and die quickly. */
186 sched_idle_next(); 215 sched_idle_next();
@@ -191,10 +220,10 @@ static int __ref take_cpu_down(void *_param)
191static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 220static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
192{ 221{
193 int err, nr_calls = 0; 222 int err, nr_calls = 0;
194 cpumask_var_t old_allowed;
195 void *hcpu = (void *)(long)cpu; 223 void *hcpu = (void *)(long)cpu;
196 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
197 struct take_cpu_down_param tcd_param = { 225 struct take_cpu_down_param tcd_param = {
226 .caller = current,
198 .mod = mod, 227 .mod = mod,
199 .hcpu = hcpu, 228 .hcpu = hcpu,
200 }; 229 };
@@ -205,38 +234,26 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
205 if (!cpu_online(cpu)) 234 if (!cpu_online(cpu))
206 return -EINVAL; 235 return -EINVAL;
207 236
208 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
209 return -ENOMEM;
210
211 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
212 set_cpu_active(cpu, false); 238 set_cpu_active(cpu, false);
213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
214 hcpu, -1, &nr_calls); 240 if (err) {
215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true); 241 set_cpu_active(cpu, true);
217 242
218 nr_calls--; 243 nr_calls--;
219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
220 hcpu, nr_calls, NULL);
221 printk("%s: attempt to take down CPU %u failed\n", 245 printk("%s: attempt to take down CPU %u failed\n",
222 __func__, cpu); 246 __func__, cpu);
223 err = -EINVAL;
224 goto out_release; 247 goto out_release;
225 } 248 }
226 249
227 /* Ensure that we are not runnable on dying cpu */
228 cpumask_copy(old_allowed, &current->cpus_allowed);
229 set_cpus_allowed_ptr(current, cpu_active_mask);
230
231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
232 if (err) { 251 if (err) {
233 set_cpu_active(cpu, true); 252 set_cpu_active(cpu, true);
234 /* CPU didn't die: tell everyone. Can't complain. */ 253 /* CPU didn't die: tell everyone. Can't complain. */
235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
236 hcpu) == NOTIFY_BAD)
237 BUG();
238 255
239 goto out_allowed; 256 goto out_release;
240 } 257 }
241 BUG_ON(cpu_online(cpu)); 258 BUG_ON(cpu_online(cpu));
242 259
@@ -248,22 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
248 __cpu_die(cpu); 265 __cpu_die(cpu);
249 266
250 /* CPU is completely dead: tell everyone. Too late to complain. */ 267 /* CPU is completely dead: tell everyone. Too late to complain. */
251 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, 268 cpu_notify_nofail(CPU_DEAD | mod, hcpu);
252 hcpu) == NOTIFY_BAD)
253 BUG();
254 269
255 check_for_tasks(cpu); 270 check_for_tasks(cpu);
256 271
257out_allowed:
258 set_cpus_allowed_ptr(current, old_allowed);
259out_release: 272out_release:
260 cpu_hotplug_done(); 273 cpu_hotplug_done();
261 if (!err) { 274 if (!err)
262 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, 275 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
263 hcpu) == NOTIFY_BAD)
264 BUG();
265 }
266 free_cpumask_var(old_allowed);
267 return err; 276 return err;
268} 277}
269 278
@@ -271,9 +280,6 @@ int __ref cpu_down(unsigned int cpu)
271{ 280{
272 int err; 281 int err;
273 282
274 err = stop_machine_create();
275 if (err)
276 return err;
277 cpu_maps_update_begin(); 283 cpu_maps_update_begin();
278 284
279 if (cpu_hotplug_disabled) { 285 if (cpu_hotplug_disabled) {
@@ -285,7 +291,6 @@ int __ref cpu_down(unsigned int cpu)
285 291
286out: 292out:
287 cpu_maps_update_done(); 293 cpu_maps_update_done();
288 stop_machine_destroy();
289 return err; 294 return err;
290} 295}
291EXPORT_SYMBOL(cpu_down); 296EXPORT_SYMBOL(cpu_down);
@@ -302,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
302 return -EINVAL; 307 return -EINVAL;
303 308
304 cpu_hotplug_begin(); 309 cpu_hotplug_begin();
305 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 310 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
306 -1, &nr_calls); 311 if (ret) {
307 if (ret == NOTIFY_BAD) {
308 nr_calls--; 312 nr_calls--;
309 printk("%s: attempt to bring up CPU %u failed\n", 313 printk("%s: attempt to bring up CPU %u failed\n",
310 __func__, cpu); 314 __func__, cpu);
311 ret = -EINVAL;
312 goto out_notify; 315 goto out_notify;
313 } 316 }
314 317
@@ -321,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 set_cpu_active(cpu, true); 324 set_cpu_active(cpu, true);
322 325
323 /* Now call notifier in preparation. */ 326 /* Now call notifier in preparation. */
324 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 327 cpu_notify(CPU_ONLINE | mod, hcpu);
325 328
326out_notify: 329out_notify:
327 if (ret != 0) 330 if (ret != 0)
328 __raw_notifier_call_chain(&cpu_chain, 331 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
329 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
330 cpu_hotplug_done(); 332 cpu_hotplug_done();
331 333
332 return ret; 334 return ret;
@@ -335,16 +337,44 @@ out_notify:
335int __cpuinit cpu_up(unsigned int cpu) 337int __cpuinit cpu_up(unsigned int cpu)
336{ 338{
337 int err = 0; 339 int err = 0;
340
341#ifdef CONFIG_MEMORY_HOTPLUG
342 int nid;
343 pg_data_t *pgdat;
344#endif
345
338 if (!cpu_possible(cpu)) { 346 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 347 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 348 "configured as may-hotadd at boot time\n", cpu);
341#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 349#if defined(CONFIG_IA64)
342 printk(KERN_ERR "please check additional_cpus= boot " 350 printk(KERN_ERR "please check additional_cpus= boot "
343 "parameter\n"); 351 "parameter\n");
344#endif 352#endif
345 return -EINVAL; 353 return -EINVAL;
346 } 354 }
347 355
356#ifdef CONFIG_MEMORY_HOTPLUG
357 nid = cpu_to_node(cpu);
358 if (!node_online(nid)) {
359 err = mem_online_node(nid);
360 if (err)
361 return err;
362 }
363
364 pgdat = NODE_DATA(nid);
365 if (!pgdat) {
366 printk(KERN_ERR
367 "Can't online cpu %d due to NULL pgdat\n", cpu);
368 return -ENOMEM;
369 }
370
371 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
372 mutex_lock(&zonelists_mutex);
373 build_all_zonelists(NULL);
374 mutex_unlock(&zonelists_mutex);
375 }
376#endif
377
348 cpu_maps_update_begin(); 378 cpu_maps_update_begin();
349 379
350 if (cpu_hotplug_disabled) { 380 if (cpu_hotplug_disabled) {
@@ -364,11 +394,8 @@ static cpumask_var_t frozen_cpus;
364 394
365int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
366{ 396{
367 int cpu, first_cpu, error; 397 int cpu, first_cpu, error = 0;
368 398
369 error = stop_machine_create();
370 if (error)
371 return error;
372 cpu_maps_update_begin(); 399 cpu_maps_update_begin();
373 first_cpu = cpumask_first(cpu_online_mask); 400 first_cpu = cpumask_first(cpu_online_mask);
374 /* 401 /*
@@ -399,7 +426,6 @@ int disable_nonboot_cpus(void)
399 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 426 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
400 } 427 }
401 cpu_maps_update_done(); 428 cpu_maps_update_done();
402 stop_machine_destroy();
403 return error; 429 return error;
404} 430}
405 431
@@ -466,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
466 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) 492 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
467 val = CPU_STARTING_FROZEN; 493 val = CPU_STARTING_FROZEN;
468#endif /* CONFIG_PM_SLEEP_SMP */ 494#endif /* CONFIG_PM_SLEEP_SMP */
469 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 495 cpu_notify(val, (void *)(long)cpu);
470} 496}
471 497
472#endif /* CONFIG_SMP */ 498#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..02b9611eadde 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -949,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 946 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
950 * we structure updates as setting all new allowed nodes, then clearing newly 947 * we structure updates as setting all new allowed nodes, then clearing newly
951 * disallowed ones. 948 * disallowed ones.
952 *
953 * Called with task's alloc_lock held
954 */ 949 */
955static void cpuset_change_task_nodemask(struct task_struct *tsk, 950static void cpuset_change_task_nodemask(struct task_struct *tsk,
956 nodemask_t *newmems) 951 nodemask_t *newmems)
957{ 952{
953repeat:
954 /*
955 * Allow tasks that have access to memory reserves because they have
956 * been OOM killed to get memory anywhere.
957 */
958 if (unlikely(test_thread_flag(TIF_MEMDIE)))
959 return;
960 if (current->flags & PF_EXITING) /* Let dying task have memory */
961 return;
962
963 task_lock(tsk);
958 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 964 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
959 mpol_rebind_task(tsk, &tsk->mems_allowed); 965 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
960 mpol_rebind_task(tsk, newmems); 966
967
968 /*
969	 * Ensure that ->mems_allowed_change_disable is checked only after all the
970	 * new allowed nodes have been set.
971	 *
972	 * The read-side task may then see a nodemask containing both the new and
973	 * the old allowed nodes; if it allocates a page while the cpuset is clearing
974	 * the newly disallowed ones, it can still use the newly allowed bits.
975	 *
976	 * If the new allowed nodes were set only after this check, setting them and
977	 * clearing the newly disallowed ones could happen back to back, and the
978	 * read-side task might find no node to allocate a page from.
979 */
980 smp_mb();
981
982 /*
983	 * Memory allocation is very fast, so we need not sleep while waiting
984	 * for the read side to finish.
985 */
986 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
987 task_unlock(tsk);
988 if (!task_curr(tsk))
989 yield();
990 goto repeat;
991 }
992
993 /*
994	 * Ensure that ->mems_allowed_change_disable is checked before clearing the
995	 * newly disallowed nodes.
996	 *
997	 * If the newly disallowed bits were cleared before this check, the
998	 * read-side task might find no node to allocate a page from.
999 */
1000 smp_mb();
1001
1002 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
961 tsk->mems_allowed = *newmems; 1003 tsk->mems_allowed = *newmems;
1004 task_unlock(tsk);
962} 1005}
963 1006
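The retry loop and memory barriers above pair with the per-task counter tsk->mems_allowed_change_disable, which page-allocation paths bump while they read mems_allowed. The matching read-side helpers live outside this file and are not shown in this hunk; conceptually they bracket the allocation roughly as below (a sketch only, helper names are assumptions):

/* conceptual read-side sketch (not from this hunk): while the counter is
 * non-zero, cpuset_change_task_nodemask() above retries before clearing the
 * newly disallowed nodes, so the reader always sees at least one valid node */
static inline void mems_allowed_read_begin(struct task_struct *tsk)
{
	tsk->mems_allowed_change_disable++;
	smp_mb();	/* pairs with the first smp_mb() in the writer */
}

static inline void mems_allowed_read_end(struct task_struct *tsk)
{
	smp_mb();	/* pairs with the second smp_mb() in the writer */
	tsk->mems_allowed_change_disable--;
}
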
964/* 1007/*
@@ -973,14 +1016,17 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 1016 struct cpuset *cs;
974 int migrate; 1017 int migrate;
975 const nodemask_t *oldmem = scan->data; 1018 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 1019 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
1020
1021 if (!newmems)
1022 return;
977 1023
978 cs = cgroup_cs(scan->cg); 1024 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 1025 guarantee_online_mems(cs, newmems);
980 1026
981 task_lock(p); 1027 cpuset_change_task_nodemask(p, newmems);
982 cpuset_change_task_nodemask(p, &newmems); 1028
983 task_unlock(p); 1029 NODEMASK_FREE(newmems);
984 1030
985 mm = get_task_mm(p); 1031 mm = get_task_mm(p);
986 if (!mm) 1032 if (!mm)
@@ -1051,16 +1097,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1097static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1098 const char *buf)
1053{ 1099{
1054 nodemask_t oldmem; 1100 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1101 int retval;
1056 struct ptr_heap heap; 1102 struct ptr_heap heap;
1057 1103
1104 if (!oldmem)
1105 return -ENOMEM;
1106
1058 /* 1107 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1108 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1109 * it's read-only
1061 */ 1110 */
1062 if (cs == &top_cpuset) 1111 if (cs == &top_cpuset) {
1063 return -EACCES; 1112 retval = -EACCES;
1113 goto done;
1114 }
1064 1115
1065 /* 1116 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1117 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1127,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1127 goto done;
1077 1128
1078 if (!nodes_subset(trialcs->mems_allowed, 1129 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1130 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1131 retval = -EINVAL;
1132 goto done;
1133 }
1081 } 1134 }
1082 oldmem = cs->mems_allowed; 1135 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1136 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1137 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1138 goto done;
1086 } 1139 }
@@ -1096,10 +1149,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1149 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1150 mutex_unlock(&callback_mutex);
1098 1151
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1152 update_tasks_nodemask(cs, oldmem, &heap);
1100 1153
1101 heap_free(&heap); 1154 heap_free(&heap);
1102done: 1155done:
1156 NODEMASK_FREE(oldmem);
1103 return retval; 1157 return retval;
1104} 1158}
1105 1159
@@ -1373,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1373 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1427 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1374 WARN_ON_ONCE(err); 1428 WARN_ON_ONCE(err);
1375 1429
1376 task_lock(tsk);
1377 cpuset_change_task_nodemask(tsk, to); 1430 cpuset_change_task_nodemask(tsk, to);
1378 task_unlock(tsk);
1379 cpuset_update_task_spread_flag(cs, tsk); 1431 cpuset_update_task_spread_flag(cs, tsk);
1380 1432
1381} 1433}
@@ -1384,40 +1436,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1436 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1437 bool threadgroup)
1386{ 1438{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1439 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1440 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1441 struct cpuset *oldcs = cgroup_cs(oldcont);
1442 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1443 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1444
1445 if (from == NULL || to == NULL)
1446 goto alloc_fail;
1391 1447
1392 if (cs == &top_cpuset) { 1448 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1450 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1451 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1452 }
1453 guarantee_online_mems(cs, to);
1399 1454
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1455 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1456 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1457 if (threadgroup) {
1403 struct task_struct *c; 1458 struct task_struct *c;
1404 rcu_read_lock(); 1459 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1460 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1461 cpuset_attach_task(c, to, cs);
1407 } 1462 }
1408 rcu_read_unlock(); 1463 rcu_read_unlock();
1409 } 1464 }
1410 1465
1411 /* change mm; only needs to be done once even if threadgroup */ 1466 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1467 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1468 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1469 mm = get_task_mm(tsk);
1415 if (mm) { 1470 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1471 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1472 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1473 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1474 mmput(mm);
1420 } 1475 }
1476
1477alloc_fail:
1478 NODEMASK_FREE(from);
1479 NODEMASK_FREE(to);
1421} 1480}
1422 1481
1423/* The various types of files and directories in a cpuset file system */ 1482/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1621,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1621
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1622static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1623{
1565 nodemask_t mask; 1624 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1625 int retval;
1626
1627 if (mask == NULL)
1628 return -ENOMEM;
1566 1629
1567 mutex_lock(&callback_mutex); 1630 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1631 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1632 mutex_unlock(&callback_mutex);
1570 1633
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1634 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1635
1636 NODEMASK_FREE(mask);
1637
1638 return retval;
1572} 1639}
1573 1640
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1641static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2064,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2064 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2065 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2066 struct cgroup *cont;
2000 nodemask_t oldmems; 2067 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2068
2069 if (oldmems == NULL)
2070 return;
2001 2071
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2072 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2073
@@ -2014,7 +2084,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2084 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2085 continue;
2016 2086
2017 oldmems = cp->mems_allowed; 2087 *oldmems = cp->mems_allowed;
2018 2088
2019 /* Remove offline cpus and mems from this cpuset. */ 2089 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2090 mutex_lock(&callback_mutex);
@@ -2030,9 +2100,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2100 remove_tasks_in_empty_cpuset(cp);
2031 else { 2101 else {
2032 update_tasks_cpumask(cp, NULL); 2102 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2103 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2104 }
2035 } 2105 }
2106 NODEMASK_FREE(oldmems);
2036} 2107}
2037 2108
2038/* 2109/*
@@ -2090,20 +2161,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2161static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2162 unsigned long action, void *arg)
2092{ 2163{
2164 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2165
2166 if (oldmems == NULL)
2167 return NOTIFY_DONE;
2168
2093 cgroup_lock(); 2169 cgroup_lock();
2094 switch (action) { 2170 switch (action) {
2095 case MEM_ONLINE: 2171 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2172 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2173 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2174 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2175 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2176 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2177 break;
2178 case MEM_OFFLINE:
2179 /*
2180 * needn't update top_cpuset.mems_allowed explicitly because
2181 * scan_for_empty_cpusets() will update it.
2182 */
2183 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2184 break;
2103 default: 2185 default:
2104 break; 2186 break;
2105 } 2187 }
2106 cgroup_unlock(); 2188 cgroup_unlock();
2189
2190 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2191 return NOTIFY_OK;
2108} 2192}
2109#endif 2193#endif
@@ -2140,19 +2224,52 @@ void __init cpuset_init_smp(void)
2140void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2224void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2141{ 2225{
2142 mutex_lock(&callback_mutex); 2226 mutex_lock(&callback_mutex);
2143 cpuset_cpus_allowed_locked(tsk, pmask); 2227 task_lock(tsk);
2228 guarantee_online_cpus(task_cs(tsk), pmask);
2229 task_unlock(tsk);
2144 mutex_unlock(&callback_mutex); 2230 mutex_unlock(&callback_mutex);
2145} 2231}
2146 2232
2147/** 2233int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2148 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2149 * Must be called with callback_mutex held.
2150 **/
2151void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2152{ 2234{
2153 task_lock(tsk); 2235 const struct cpuset *cs;
2154 guarantee_online_cpus(task_cs(tsk), pmask); 2236 int cpu;
2155 task_unlock(tsk); 2237
2238 rcu_read_lock();
2239 cs = task_cs(tsk);
2240 if (cs)
2241 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2242 rcu_read_unlock();
2243
2244 /*
2245 * We own tsk->cpus_allowed, nobody can change it under us.
2246 *
2247 * But we used cs && cs->cpus_allowed lockless and thus can
2248 * race with cgroup_attach_task() or update_cpumask() and get
2249 * the wrong tsk->cpus_allowed. However, both cases imply the
2250 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2251 * which takes task_rq_lock().
2252 *
2253 * If we are called after it dropped the lock we must see all
2254 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
2255 * set any mask even if it is not right from task_cs() pov,
2256 * the pending set_cpus_allowed_ptr() will fix things.
2257 */
2258
2259 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2260 if (cpu >= nr_cpu_ids) {
2261 /*
2262 * Either tsk->cpus_allowed is wrong (see above) or it
2263 * is actually empty. The latter case is only possible
2264 * if we are racing with remove_tasks_in_empty_cpuset().
2265 * Like above we can temporarily set any mask and rely on
2266 * set_cpus_allowed_ptr() as synchronization point.
2267 */
2268 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2269 cpu = cpumask_any(cpu_active_mask);
2270 }
2271
2272 return cpu;
2156} 2273}
2157 2274
2158void cpuset_init_current_mems_allowed(void) 2275void cpuset_init_current_mems_allowed(void)
@@ -2341,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2341} 2458}
2342 2459
2343/** 2460/**
2344 * cpuset_lock - lock out any changes to cpuset structures
2345 *
2346 * The out of memory (oom) code needs to mutex_lock cpusets
2347 * from being changed while it scans the tasklist looking for a
2348 * task in an overlapping cpuset. Expose callback_mutex via this
2349 * cpuset_lock() routine, so the oom code can lock it, before
2350 * locking the task list. The tasklist_lock is a spinlock, so
2351 * must be taken inside callback_mutex.
2352 */
2353
2354void cpuset_lock(void)
2355{
2356 mutex_lock(&callback_mutex);
2357}
2358
2359/**
2360 * cpuset_unlock - release lock on cpuset changes 2461 * cpuset_unlock - release lock on cpuset changes
2361 * 2462 *
2362 * Undo the lock taken in a previous cpuset_lock() call. 2463 * Undo the lock taken in a previous cpuset_lock() call.
@@ -2368,7 +2469,8 @@ void cpuset_unlock(void)
2368} 2469}
2369 2470
2370/** 2471/**
2371 * cpuset_mem_spread_node() - On which node to begin search for a page 2472 * cpuset_mem_spread_node() - On which node to begin search for a file page
2473 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2372 * 2474 *
2373 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2475 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2374 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2476 * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2393,16 +2495,27 @@ void cpuset_unlock(void)
2393 * See kmem_cache_alloc_node(). 2495 * See kmem_cache_alloc_node().
2394 */ 2496 */
2395 2497
2396int cpuset_mem_spread_node(void) 2498static int cpuset_spread_node(int *rotor)
2397{ 2499{
2398 int node; 2500 int node;
2399 2501
2400 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); 2502 node = next_node(*rotor, current->mems_allowed);
2401 if (node == MAX_NUMNODES) 2503 if (node == MAX_NUMNODES)
2402 node = first_node(current->mems_allowed); 2504 node = first_node(current->mems_allowed);
2403 current->cpuset_mem_spread_rotor = node; 2505 *rotor = node;
2404 return node; 2506 return node;
2405} 2507}
2508
2509int cpuset_mem_spread_node(void)
2510{
2511 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2512}
2513
2514int cpuset_slab_spread_node(void)
2515{
2516 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2517}
2518
2406EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2519EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
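A short worked example (assumed node values, not from the patch) of how the shared rotor above cycles through the allowed nodes:

	/*
	 * If current->mems_allowed = { 0, 2 } and the rotor currently holds 0,
	 * next_node() returns 2; on the following call next_node(2, ...) is
	 * MAX_NUMNODES, so the rotor wraps around to first_node() = 0 again.
	 */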
2407 2520
2408/** 2521/**
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..60bc8b1e32e6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,13 +10,13 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
16#include <linux/init_task.h> 17#include <linux/init_task.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
19#include "cred-internals.h"
20 20
21#if 0 21#if 0
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
209 } 209 }
210} 210}
211 211
212/**
213 * get_task_cred - Get another task's objective credentials
214 * @task: The task to query
215 *
216 * Get the objective credentials of a task, pinning them so that they can't go
217 * away. Accessing a task's credentials directly is not permitted.
218 *
219 * The caller must also make sure task doesn't get deleted, either by holding a
220 * ref on task or by holding tasklist_lock to prevent it from being unlinked.
221 */
222const struct cred *get_task_cred(struct task_struct *task)
223{
224 const struct cred *cred;
225
226 rcu_read_lock();
227
228 do {
229 cred = __task_cred((task));
230 BUG_ON(!cred);
231 } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
232
233 rcu_read_unlock();
234 return cred;
235}
236
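An illustrative usage sketch (not part of this patch): a caller pins the credentials with get_task_cred() and drops them with put_cred() when done. The helper name and the ->uid access below are only an example.

	/* Hypothetical example, assuming <linux/cred.h> and <linux/sched.h>. */
	static uid_t example_task_uid(struct task_struct *task)
	{
		const struct cred *cred = get_task_cred(task);	/* takes a reference */
		uid_t uid = cred->uid;				/* safe while the ref is held */

		put_cred(cred);					/* release the reference */
		return uid;
	}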
212/* 237/*
213 * Allocate blank credentials, such that the credentials can be filled in at a 238 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM. 239 * later date without risk of ENOMEM.
@@ -224,7 +249,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 249#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 251 if (!new->tgcred) {
227 kfree(new); 252 kmem_cache_free(cred_jar, new);
228 return NULL; 253 return NULL;
229 } 254 }
230 atomic_set(&new->tgcred->usage, 1); 255 atomic_set(&new->tgcred->usage, 1);
@@ -347,60 +372,6 @@ struct cred *prepare_exec_creds(void)
347} 372}
348 373
349/* 374/*
350 * prepare new credentials for the usermode helper dispatcher
351 */
352struct cred *prepare_usermodehelper_creds(void)
353{
354#ifdef CONFIG_KEYS
355 struct thread_group_cred *tgcred = NULL;
356#endif
357 struct cred *new;
358
359#ifdef CONFIG_KEYS
360 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
361 if (!tgcred)
362 return NULL;
363#endif
364
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new)
367 return NULL;
368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
371 memcpy(new, &init_cred, sizeof(struct cred));
372
373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
375 get_group_info(new->group_info);
376 get_uid(new->user);
377
378#ifdef CONFIG_KEYS
379 new->thread_keyring = NULL;
380 new->request_key_auth = NULL;
381 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
382
383 atomic_set(&tgcred->usage, 1);
384 spin_lock_init(&tgcred->lock);
385 new->tgcred = tgcred;
386#endif
387
388#ifdef CONFIG_SECURITY
389 new->security = NULL;
390#endif
391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
392 goto error;
393 validate_creds(new);
394
395 BUG_ON(atomic_read(&new->usage) != 1);
396 return new;
397
398error:
399 put_cred(new);
400 return NULL;
401}
402
403/*
404 * Copy credentials for the new process created by fork() 375 * Copy credentials for the new process created by fork()
405 * 376 *
406 * We share if we can, but under some circumstances we have to generate a new 377 * We share if we can, but under some circumstances we have to generate a new
@@ -516,8 +487,6 @@ int commit_creds(struct cred *new)
516#endif 487#endif
517 BUG_ON(atomic_read(&new->usage) < 1); 488 BUG_ON(atomic_read(&new->usage) < 1);
518 489
519 security_commit_creds(new, old);
520
521 get_cred(new); /* we will require a ref for the subj creds too */ 490 get_cred(new); /* we will require a ref for the subj creds too */
522 491
523 /* dumpability changes */ 492 /* dumpability changes */
@@ -553,8 +522,6 @@ int commit_creds(struct cred *new)
553 atomic_dec(&old->user->processes); 522 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2); 523 alter_cred_subscribers(old, -2);
555 524
556 sched_switch_user(task);
557
558 /* send notifications */ 525 /* send notifications */
559 if (new->uid != old->uid || 526 if (new->uid != old->uid ||
560 new->euid != old->euid || 527 new->euid != old->euid ||
@@ -786,8 +753,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 753{
787 if (cred->magic != CRED_MAGIC) 754 if (cred->magic != CRED_MAGIC)
788 return true; 755 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 756#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 757 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 758 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
1#
2# Makefile for the linux kernel debugger
3#
4
5obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
6obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..8bc5eeffec8a
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,983 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/string.h>
41#include <linux/delay.h>
42#include <linux/sched.h>
43#include <linux/sysrq.h>
44#include <linux/init.h>
45#include <linux/kgdb.h>
46#include <linux/kdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56#include "debug_core.h"
57
58static int kgdb_break_asap;
59
60struct debuggerinfo_struct kgdb_info[NR_CPUS];
61
62/**
63 * kgdb_connected - Is a host GDB connected to us?
64 */
65int kgdb_connected;
66EXPORT_SYMBOL_GPL(kgdb_connected);
67
68/* All the KGDB handlers are installed */
69int kgdb_io_module_registered;
70
71/* Guard for recursive entry */
72static int exception_level;
73
74struct kgdb_io *dbg_io_ops;
75static DEFINE_SPINLOCK(kgdb_registration_lock);
76
77/* kgdb console driver is loaded */
78static int kgdb_con_registered;
79/* determine if kgdb console output should be used */
80static int kgdb_use_con;
81/* Flag for alternate operations for early debugging */
82bool dbg_is_early = true;
83/* Next cpu to become the master debug core */
84int dbg_switch_cpu;
85
86/* Use kdb or gdbserver mode */
87int dbg_kdb_mode = 1;
88
89static int __init opt_kgdb_con(char *str)
90{
91 kgdb_use_con = 1;
92 return 0;
93}
94
95early_param("kgdbcon", opt_kgdb_con);
96
97module_param(kgdb_use_con, int, 0644);
98
99/*
100 * Holds information about breakpoints in a kernel. These breakpoints are
101 * added and removed by gdb.
102 */
103static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
104 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
105};
106
107/*
108 * The CPU# of the active CPU, or -1 if none:
109 */
110atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active);
112
113/*
114 * We use NR_CPUS not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet):
116 */
117static atomic_t passive_cpu_wait[NR_CPUS];
118static atomic_t cpu_in_kgdb[NR_CPUS];
119static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint;
121
122struct task_struct *kgdb_usethread;
123struct task_struct *kgdb_contthread;
124
125int kgdb_single_step;
126static pid_t kgdb_sstep_pid;
127
128/* to keep track of the CPU which is doing the single stepping*/
129atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
130
131/*
132 * If you are debugging a problem where roundup (the collection of
133 * all other CPUs) is a problem [this should be extremely rare],
134 * then use the nokgdbroundup option to avoid roundup. In that case
135 * the other CPUs might interfere with your debugging context, so
136 * use this with care:
137 */
138static int kgdb_do_roundup = 1;
139
140static int __init opt_nokgdbroundup(char *str)
141{
142 kgdb_do_roundup = 0;
143
144 return 0;
145}
146
147early_param("nokgdbroundup", opt_nokgdbroundup);
148
149/*
150 * Finally, some KGDB code :-)
151 */
152
153/*
154 * Weak aliases for breakpoint management,
155 * can be overridden by architectures when needed:
156 */
157int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
158{
159 int err;
160
161 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
162 if (err)
163 return err;
164
165 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
166 BREAK_INSTR_SIZE);
167}
168
169int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
170{
171 return probe_kernel_write((char *)addr,
172 (char *)bundle, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_validate_break_address(unsigned long addr)
176{
177 char tmp_variable[BREAK_INSTR_SIZE];
178 int err;
179	/* Validate setting the breakpoint and then removing it. If the
180	 * remove fails, the kernel needs to emit a bad message because we
181	 * are in deep trouble, not being able to put things back the way we
182	 * found them.
183 */
184 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
185 if (err)
186 return err;
187 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
188 if (err)
189 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
190 "memory destroyed at: %lx", addr);
191 return err;
192}
193
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{
196 return instruction_pointer(regs);
197}
198
199int __weak kgdb_arch_init(void)
200{
201 return 0;
202}
203
204int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
205{
206 return 0;
207}
208
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/*
222 * Some architectures need cache flushes when we set/clear a
223 * breakpoint:
224 */
225static void kgdb_flush_swbreak_addr(unsigned long addr)
226{
227 if (!CACHE_FLUSH_IS_SAFE)
228 return;
229
230 if (current->mm && current->mm->mmap_cache) {
231 flush_cache_range(current->mm->mmap_cache,
232 addr, addr + BREAK_INSTR_SIZE);
233 }
234 /* Force flush instruction cache if it was outside the mm */
235 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
236}
237
238/*
239 * SW breakpoint management:
240 */
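The breakpoint slot lifecycle implemented by the helpers below, summarised for reference (derived from the state checks in this file):

/*
 * BP_UNDEFINED --dbg_set_sw_break()--------------> BP_SET
 * BP_SET       --dbg_activate_sw_breakpoints()---> BP_ACTIVE
 * BP_ACTIVE    --dbg_deactivate_sw_breakpoints()-> BP_SET
 * BP_SET       --dbg_remove_sw_break()-----------> BP_REMOVED
 * any state    --dbg_remove_all_break()----------> BP_UNDEFINED
 */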
241int dbg_activate_sw_breakpoints(void)
242{
243 unsigned long addr;
244 int error;
245 int ret = 0;
246 int i;
247
248 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
249 if (kgdb_break[i].state != BP_SET)
250 continue;
251
252 addr = kgdb_break[i].bpt_addr;
253 error = kgdb_arch_set_breakpoint(addr,
254 kgdb_break[i].saved_instr);
255 if (error) {
256 ret = error;
257 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
258 continue;
259 }
260
261 kgdb_flush_swbreak_addr(addr);
262 kgdb_break[i].state = BP_ACTIVE;
263 }
264 return ret;
265}
266
267int dbg_set_sw_break(unsigned long addr)
268{
269 int err = kgdb_validate_break_address(addr);
270 int breakno = -1;
271 int i;
272
273 if (err)
274 return err;
275
276 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
277 if ((kgdb_break[i].state == BP_SET) &&
278 (kgdb_break[i].bpt_addr == addr))
279 return -EEXIST;
280 }
281 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
282 if (kgdb_break[i].state == BP_REMOVED &&
283 kgdb_break[i].bpt_addr == addr) {
284 breakno = i;
285 break;
286 }
287 }
288
289 if (breakno == -1) {
290 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
291 if (kgdb_break[i].state == BP_UNDEFINED) {
292 breakno = i;
293 break;
294 }
295 }
296 }
297
298 if (breakno == -1)
299 return -E2BIG;
300
301 kgdb_break[breakno].state = BP_SET;
302 kgdb_break[breakno].type = BP_BREAKPOINT;
303 kgdb_break[breakno].bpt_addr = addr;
304
305 return 0;
306}
307
308int dbg_deactivate_sw_breakpoints(void)
309{
310 unsigned long addr;
311 int error;
312 int ret = 0;
313 int i;
314
315 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
316 if (kgdb_break[i].state != BP_ACTIVE)
317 continue;
318 addr = kgdb_break[i].bpt_addr;
319 error = kgdb_arch_remove_breakpoint(addr,
320 kgdb_break[i].saved_instr);
321 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
323 ret = error;
324 }
325
326 kgdb_flush_swbreak_addr(addr);
327 kgdb_break[i].state = BP_SET;
328 }
329 return ret;
330}
331
332int dbg_remove_sw_break(unsigned long addr)
333{
334 int i;
335
336 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
337 if ((kgdb_break[i].state == BP_SET) &&
338 (kgdb_break[i].bpt_addr == addr)) {
339 kgdb_break[i].state = BP_REMOVED;
340 return 0;
341 }
342 }
343 return -ENOENT;
344}
345
346int kgdb_isremovedbreak(unsigned long addr)
347{
348 int i;
349
350 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
351 if ((kgdb_break[i].state == BP_REMOVED) &&
352 (kgdb_break[i].bpt_addr == addr))
353 return 1;
354 }
355 return 0;
356}
357
358int dbg_remove_all_break(void)
359{
360 unsigned long addr;
361 int error;
362 int i;
363
364 /* Clear memory breakpoints. */
365 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
366 if (kgdb_break[i].state != BP_ACTIVE)
367 goto setundefined;
368 addr = kgdb_break[i].bpt_addr;
369 error = kgdb_arch_remove_breakpoint(addr,
370 kgdb_break[i].saved_instr);
371 if (error)
372 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
373 addr);
374setundefined:
375 kgdb_break[i].state = BP_UNDEFINED;
376 }
377
378 /* Clear hardware breakpoints. */
379 if (arch_kgdb_ops.remove_all_hw_break)
380 arch_kgdb_ops.remove_all_hw_break();
381
382 return 0;
383}
384
385/*
386 * Return true if there is a valid kgdb I/O module. Also if no
387 * debugger is attached, a message can be printed to the console about
388 * waiting for the debugger to attach.
389 *
390 * The print_wait argument should only be true when called from inside
391 * the core kgdb_handle_exception, because it will wait for the
392 * debugger to attach.
393 */
394static int kgdb_io_ready(int print_wait)
395{
396 if (!dbg_io_ops)
397 return 0;
398 if (kgdb_connected)
399 return 1;
400 if (atomic_read(&kgdb_setting_breakpoint))
401 return 1;
402 if (print_wait) {
403#ifdef CONFIG_KGDB_KDB
404 if (!dbg_kdb_mode)
405 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
406#else
407 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
408#endif
409 }
410 return 1;
411}
412
413static int kgdb_reenter_check(struct kgdb_state *ks)
414{
415 unsigned long addr;
416
417 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
418 return 0;
419
420 /* Panic on recursive debugger calls: */
421 exception_level++;
422 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
423 dbg_deactivate_sw_breakpoints();
424
425 /*
426	 * If the breakpoint was removed OK at the place the exception
427	 * occurred, try to recover and print a warning to the end
428	 * user, because the user planted a breakpoint in a place that
429	 * KGDB needs in order to function.
430 */
431 if (dbg_remove_sw_break(addr) == 0) {
432 exception_level = 0;
433 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
434 dbg_activate_sw_breakpoints();
435 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
436 addr);
437 WARN_ON_ONCE(1);
438
439 return 1;
440 }
441 dbg_remove_all_break();
442 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
443
444 if (exception_level > 1) {
445 dump_stack();
446 panic("Recursive entry to debugger");
447 }
448
449 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
450#ifdef CONFIG_KGDB_KDB
451 /* Allow kdb to debug itself one level */
452 return 0;
453#endif
454 dump_stack();
455 panic("Recursive entry to debugger");
456
457 return 1;
458}
459
460static void dbg_cpu_switch(int cpu, int next_cpu)
461{
462 /* Mark the cpu we are switching away from as a slave when it
463	 * holds the kgdb_active token. This must be done so that all
464	 * the cpus waiting in the debug core loop will not enter
465	 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471}
472
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
474{
475 unsigned long flags;
476 int sstep_tries = 100;
477 int error;
478 int i, cpu;
479 int trace_on = 0;
480acquirelock:
481 /*
482 * Interrupts will be restored by the 'trap return' code, except when
483 * single stepping.
484 */
485 local_irq_save(flags);
486
487 cpu = ks->cpu;
488 kgdb_info[cpu].debuggerinfo = regs;
489 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497
498 if (exception_level == 1)
499 goto cpu_master_loop;
500
501 /*
502 * CPU will loop if it is a slave or request to become a kgdb
503 * master cpu and acquire the kgdb_active lock:
504 */
505 while (1) {
506cpu_loop:
507 if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
512 break;
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu]))
515 goto return_normal;
516 } else {
517return_normal:
518 /* Return to normal operation by executing any
519 * hw breakpoint fixup.
520 */
521 if (arch_kgdb_ops.correct_hw_break)
522 arch_kgdb_ops.correct_hw_break();
523 if (trace_on)
524 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]);
526 touch_softlockup_watchdog_sync();
527 clocksource_touch_watchdog();
528 local_irq_restore(flags);
529 return 0;
530 }
531 cpu_relax();
532 }
533
534 /*
535 * For single stepping, try to only enter on the processor
536	 * that was single stepping. To guard against a deadlock, the
537 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on.
539 */
540 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
541 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync();
545 clocksource_touch_watchdog();
546 local_irq_restore(flags);
547
548 goto acquirelock;
549 }
550
551 if (!kgdb_io_ready(1)) {
552 kgdb_info[cpu].ret_state = 1;
553 goto kgdb_restore; /* No I/O connection, resume the system */
554 }
555
556 /*
557 * Don't enter if we have hit a removed breakpoint.
558 */
559 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
560 goto kgdb_restore;
561
562 /* Call the I/O driver's pre_exception routine */
563 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception();
565
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /*
569 * Get the passive CPU lock which will hold all the non-primary
570	 * CPUs in a spin state while the debugger is active
571 */
572 if (!kgdb_single_step) {
573 for (i = 0; i < NR_CPUS; i++)
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576
577#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags);
581#endif
582
583 /*
584 * Wait for the other CPUs to be notified and be waiting for us:
585 */
586 for_each_online_cpu(i) {
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
588 cpu_relax();
589 }
590
591 /*
592 * At this point the primary processor is completely
593 * in the debugger and all secondary CPUs are quiescent
594 */
595 dbg_deactivate_sw_breakpoints();
596 kgdb_single_step = 0;
597 kgdb_contthread = current;
598 exception_level = 0;
599 trace_on = tracing_is_on();
600 if (trace_on)
601 tracing_off();
602
603 while (1) {
604cpu_master_loop:
605 if (dbg_kdb_mode) {
606 kgdb_connected = 1;
607 error = kdb_stub(ks);
608 kgdb_connected = 0;
609 } else {
610 error = gdb_serial_stub(ks);
611 }
612
613 if (error == DBG_PASS_EVENT) {
614 dbg_kdb_mode = !dbg_kdb_mode;
615 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu);
617 goto cpu_loop;
618 } else {
619 kgdb_info[cpu].ret_state = error;
620 break;
621 }
622 }
623
624 /* Call the I/O driver's post_exception routine */
625 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception();
627
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--)
632 atomic_dec(&passive_cpu_wait[i]);
633 /*
634 * Wait till all the CPUs have quit from the debugger,
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 }
647
648kgdb_restore:
649 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
650 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
651 if (kgdb_info[sstep_cpu].task)
652 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
653 else
654 kgdb_sstep_pid = 0;
655 }
656 if (trace_on)
657 tracing_on();
658 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync();
661 clocksource_touch_watchdog();
662 local_irq_restore(flags);
663
664 return kgdb_info[cpu].ret_state;
665}
666
667/*
668 * kgdb_handle_exception() - main entry point from a kernel exception
669 *
670 * Locking hierarchy:
671 * interface locks, if any (begin_session)
672 * kgdb lock (kgdb_active)
673 */
674int
675kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{
677 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680
681 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector;
683 ks->signo = signo;
684 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs;
687
688 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
691 ret = kgdb_cpu_enter(ks, regs);
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
693 DCPU_IS_SLAVE);
694 return ret;
695}
696
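For illustration only, an architecture's trap handler hands control to the entry point above roughly as follows; the function name, vector number and error code are hypothetical:

	/* Hypothetical arch glue -- not part of this patch. */
	static void example_arch_breakpoint_trap(struct pt_regs *regs)
	{
		/* vector 3, SIGTRAP and error code 0 are illustrative values */
		if (kgdb_handle_exception(3, SIGTRAP, 0, regs))
			printk(KERN_ERR "example: debug core did not take the trap\n");
	}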
697int kgdb_nmicallback(int cpu, void *regs)
698{
699#ifdef CONFIG_SMP
700 struct kgdb_state kgdb_var;
701 struct kgdb_state *ks = &kgdb_var;
702
703 memset(ks, 0, sizeof(struct kgdb_state));
704 ks->cpu = cpu;
705 ks->linux_regs = regs;
706
707 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
708 atomic_read(&kgdb_active) != -1 &&
709 atomic_read(&kgdb_active) != cpu) {
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0;
714 }
715#endif
716 return 1;
717}
718
719static void kgdb_console_write(struct console *co, const char *s,
720 unsigned count)
721{
722 unsigned long flags;
723
724	/* If we're debugging, or KGDB has not connected, don't try
725	 * to print. */
726 if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
727 return;
728
729 local_irq_save(flags);
730 gdbstub_msg_write(s, count);
731 local_irq_restore(flags);
732}
733
734static struct console kgdbcons = {
735 .name = "kgdb",
736 .write = kgdb_console_write,
737 .flags = CON_PRINTBUFFER | CON_ENABLED,
738 .index = -1,
739};
740
741#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty)
743{
744 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
746 return;
747 }
748 if (!kgdb_connected) {
749#ifdef CONFIG_KGDB_KDB
750 if (!dbg_kdb_mode)
751 printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
752#else
753 printk(KERN_CRIT "Entering KGDB\n");
754#endif
755 }
756
757 kgdb_breakpoint();
758}
759
760static struct sysrq_key_op sysrq_dbg_op = {
761 .handler = sysrq_handle_dbg,
762 .help_msg = "debug(G)",
763 .action_msg = "DEBUG",
764};
765#endif
766
767static int kgdb_panic_event(struct notifier_block *self,
768 unsigned long val,
769 void *data)
770{
771 if (dbg_kdb_mode)
772 kdb_printf("PANIC: %s\n", (char *)data);
773 kgdb_breakpoint();
774 return NOTIFY_DONE;
775}
776
777static struct notifier_block kgdb_panic_event_nb = {
778 .notifier_call = kgdb_panic_event,
779 .priority = INT_MAX,
780};
781
782void __weak kgdb_arch_late(void)
783{
784}
785
786void __init dbg_late_init(void)
787{
788 dbg_is_early = false;
789 if (kgdb_io_module_registered)
790 kgdb_arch_late();
791 kdb_init(KDB_INIT_FULL);
792}
793
794static void kgdb_register_callbacks(void)
795{
796 if (!kgdb_io_module_registered) {
797 kgdb_io_module_registered = 1;
798 kgdb_arch_init();
799 if (!dbg_is_early)
800 kgdb_arch_late();
801 atomic_notifier_chain_register(&panic_notifier_list,
802 &kgdb_panic_event_nb);
803#ifdef CONFIG_MAGIC_SYSRQ
804 register_sysrq_key('g', &sysrq_dbg_op);
805#endif
806 if (kgdb_use_con && !kgdb_con_registered) {
807 register_console(&kgdbcons);
808 kgdb_con_registered = 1;
809 }
810 }
811}
812
813static void kgdb_unregister_callbacks(void)
814{
815 /*
816 * When this routine is called KGDB should unregister from the
817 * panic handler and clean up, making sure it is not handling any
818 * break exceptions at the time.
819 */
820 if (kgdb_io_module_registered) {
821 kgdb_io_module_registered = 0;
822 atomic_notifier_chain_unregister(&panic_notifier_list,
823 &kgdb_panic_event_nb);
824 kgdb_arch_exit();
825#ifdef CONFIG_MAGIC_SYSRQ
826 unregister_sysrq_key('g', &sysrq_dbg_op);
827#endif
828 if (kgdb_con_registered) {
829 unregister_console(&kgdbcons);
830 kgdb_con_registered = 0;
831 }
832 }
833}
834
835/*
836 * There are times a tasklet needs to be used vs a compiled in
837 * break point so as to cause an exception outside a kgdb I/O module,
838 * such as is the case with kgdboe, where calling a breakpoint in the
839 * I/O driver itself would be fatal.
840 */
841static void kgdb_tasklet_bpt(unsigned long ing)
842{
843 kgdb_breakpoint();
844 atomic_set(&kgdb_break_tasklet_var, 0);
845}
846
847static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
848
849void kgdb_schedule_breakpoint(void)
850{
851 if (atomic_read(&kgdb_break_tasklet_var) ||
852 atomic_read(&kgdb_active) != -1 ||
853 atomic_read(&kgdb_setting_breakpoint))
854 return;
855 atomic_inc(&kgdb_break_tasklet_var);
856 tasklet_schedule(&kgdb_tasklet_breakpoint);
857}
858EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
859
860static void kgdb_initial_breakpoint(void)
861{
862 kgdb_break_asap = 0;
863
864 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
865 kgdb_breakpoint();
866}
867
868/**
869 * kgdb_register_io_module - register KGDB IO module
870 * @new_dbg_io_ops: the io ops vector
871 *
872 * Register it with the KGDB core.
873 */
874int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
875{
876 int err;
877
878 spin_lock(&kgdb_registration_lock);
879
880 if (dbg_io_ops) {
881 spin_unlock(&kgdb_registration_lock);
882
883 printk(KERN_ERR "kgdb: Another I/O driver is already "
884 "registered with KGDB.\n");
885 return -EBUSY;
886 }
887
888 if (new_dbg_io_ops->init) {
889 err = new_dbg_io_ops->init();
890 if (err) {
891 spin_unlock(&kgdb_registration_lock);
892 return err;
893 }
894 }
895
896 dbg_io_ops = new_dbg_io_ops;
897
898 spin_unlock(&kgdb_registration_lock);
899
900 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
901 new_dbg_io_ops->name);
902
903 /* Arm KGDB now. */
904 kgdb_register_callbacks();
905
906 if (kgdb_break_asap)
907 kgdb_initial_breakpoint();
908
909 return 0;
910}
911EXPORT_SYMBOL_GPL(kgdb_register_io_module);
912
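A minimal registration sketch, illustrative only: the driver name and callbacks are hypothetical, the member names are assumed from the dbg_io_ops usage in this file, and a real driver would normally also supply flush/init hooks.

	static int example_dbg_read_char(void)
	{
		return NO_POLL_CHAR;	/* pretend no character is available */
	}

	static void example_dbg_write_char(u8 c)
	{
		/* push 'c' out over the example transport here */
	}

	static struct kgdb_io example_dbg_io_ops = {
		.name		= "example_kgdbio",
		.read_char	= example_dbg_read_char,
		.write_char	= example_dbg_write_char,
	};

	/* ... later, e.g. from the driver's init code: */
	/* err = kgdb_register_io_module(&example_dbg_io_ops); */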
913/**
914 * kgdb_unregister_io_module - unregister KGDB IO module
915 * @old_dbg_io_ops: the io ops vector
916 *
917 * Unregister it with the KGDB core.
918 */
919void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
920{
921 BUG_ON(kgdb_connected);
922
923 /*
924 * KGDB is no longer able to communicate out, so
925 * unregister our callbacks and reset state.
926 */
927 kgdb_unregister_callbacks();
928
929 spin_lock(&kgdb_registration_lock);
930
931 WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
932 dbg_io_ops = NULL;
933
934 spin_unlock(&kgdb_registration_lock);
935
936 printk(KERN_INFO
937 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
938 old_dbg_io_ops->name);
939}
940EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
941
942int dbg_io_get_char(void)
943{
944 int ret = dbg_io_ops->read_char();
945 if (ret == NO_POLL_CHAR)
946 return -1;
947 if (!dbg_kdb_mode)
948 return ret;
949 if (ret == 127)
950 return 8;
951 return ret;
952}
953
954/**
955 * kgdb_breakpoint - generate breakpoint exception
956 *
957 * This function will generate a breakpoint exception. It is used at the
958 * beginning of a program to sync up with a debugger and can be used
959 * otherwise as a quick means to stop program execution and "break" into
960 * the debugger.
961 */
962void kgdb_breakpoint(void)
963{
964 atomic_inc(&kgdb_setting_breakpoint);
965 wmb(); /* Sync point before breakpoint */
966 arch_kgdb_breakpoint();
967 wmb(); /* Sync point after breakpoint */
968 atomic_dec(&kgdb_setting_breakpoint);
969}
970EXPORT_SYMBOL_GPL(kgdb_breakpoint);
971
972static int __init opt_kgdb_wait(char *str)
973{
974 kgdb_break_asap = 1;
975
976 kdb_init(KDB_INIT_EARLY);
977 if (kgdb_io_module_registered)
978 kgdb_initial_breakpoint();
979
980 return 0;
981}
982
983early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#ifndef _DEBUG_CORE_H_
12#define _DEBUG_CORE_H_
13/*
14 * These are the private implementation headers between the kernel
15 * debugger core and the debugger front end code.
16 */
17
18/* kernel debug core data structures */
19struct kgdb_state {
20 int ex_vector;
21 int signo;
22 int err_code;
23 int cpu;
24 int pass_exception;
25 unsigned long thr_query;
26 unsigned long threadid;
27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs;
29};
30
31/* Exception state values */
32#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
33#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
34#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
35#define DCPU_SSTEP 0x8 /* CPU is single stepping */
36
37struct debuggerinfo_struct {
38 void *debuggerinfo;
39 struct task_struct *task;
40 int exception_state;
41 int ret_state;
42 int irq_depth;
43};
44
45extern struct debuggerinfo_struct kgdb_info[];
46
47/* kernel debug core break point routines */
48extern int dbg_remove_all_break(void);
49extern int dbg_set_sw_break(unsigned long addr);
50extern int dbg_remove_sw_break(unsigned long addr);
51extern int dbg_activate_sw_breakpoints(void);
52extern int dbg_deactivate_sw_breakpoints(void);
53
54/* polled character access to i/o module */
55extern int dbg_io_get_char(void);
56
57/* stub return value for switching between the gdbstub and kdb */
58#define DBG_PASS_EVENT -12345
59/* Switch from one cpu to another */
60#define DBG_SWITCH_CPU_EVENT -123456
61extern int dbg_switch_cpu;
62
63/* gdbstub interface functions */
64extern int gdb_serial_stub(struct kgdb_state *ks);
65extern void gdbstub_msg_write(const char *s, int len);
66
67/* gdbstub functions used for kdb <-> gdbstub transition */
68extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
69extern int dbg_kdb_mode;
70
71#ifdef CONFIG_KGDB_KDB
72extern int kdb_stub(struct kgdb_state *ks);
73extern int kdb_parse(const char *cmdstr);
74#else /* ! CONFIG_KGDB_KDB */
75static inline int kdb_stub(struct kgdb_state *ks)
76{
77 return DBG_PASS_EVENT;
78}
79#endif /* CONFIG_KGDB_KDB */
80
81#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..e8fd6868682d
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1014 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30
31#include <linux/kernel.h>
32#include <linux/kgdb.h>
33#include <linux/kdb.h>
34#include <linux/reboot.h>
35#include <linux/uaccess.h>
36#include <asm/cacheflush.h>
37#include <asm/unaligned.h>
38#include "debug_core.h"
39
40#define KGDB_MAX_THREAD_QUERY 17
41
42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX];
45
46/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES +
48 sizeof(unsigned long) - 1) /
49 sizeof(unsigned long)];
50
51/*
52 * GDB remote protocol parser:
53 */
54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void)
68{
69 int ret = -1;
70 int i;
71
72 /* poll any additional I/O interfaces that are defined */
73 while (ret < 0)
74 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
75 ret = kdb_poll_funcs[i]();
76 if (ret > 0)
77 break;
78 }
79 return ret;
80}
81#else
82static int gdbstub_read_wait(void)
83{
84 int ret = dbg_io_ops->read_char();
85 while (ret == NO_POLL_CHAR)
86 ret = dbg_io_ops->read_char();
87 return ret;
88}
89#endif
90/* scan for the sequence $<data>#<checksum> */
91static void get_packet(char *buffer)
92{
93 unsigned char checksum;
94 unsigned char xmitcsum;
95 int count;
96 char ch;
97
98 do {
99 /*
100 * Spin and wait around for the start character, ignore all
101 * other characters:
102 */
103 while ((ch = (gdbstub_read_wait())) != '$')
104 /* nothing */;
105
106 kgdb_connected = 1;
107 checksum = 0;
108 xmitcsum = -1;
109
110 count = 0;
111
112 /*
113 * now, read until a # or end of buffer is found:
114 */
115 while (count < (BUFMAX - 1)) {
116 ch = gdbstub_read_wait();
117 if (ch == '#')
118 break;
119 checksum = checksum + ch;
120 buffer[count] = ch;
121 count = count + 1;
122 }
123 buffer[count] = 0;
124
125 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait());
128
129 if (checksum != xmitcsum)
130 /* failed checksum */
131 dbg_io_ops->write_char('-');
132 else
133 /* successful transfer */
134 dbg_io_ops->write_char('+');
135 if (dbg_io_ops->flush)
136 dbg_io_ops->flush();
137 }
138 } while (checksum != xmitcsum);
139}
140
141/*
142 * Send the packet in buffer.
143 * Check for gdb connection if asked for.
144 */
145static void put_packet(char *buffer)
146{
147 unsigned char checksum;
148 int count;
149 char ch;
150
151 /*
152 * $<packet info>#<checksum>.
153 */
154 while (1) {
155 dbg_io_ops->write_char('$');
156 checksum = 0;
157 count = 0;
158
159 while ((ch = buffer[count])) {
160 dbg_io_ops->write_char(ch);
161 checksum += ch;
162 count++;
163 }
164
165 dbg_io_ops->write_char('#');
166 dbg_io_ops->write_char(hex_asc_hi(checksum));
167 dbg_io_ops->write_char(hex_asc_lo(checksum));
168 if (dbg_io_ops->flush)
169 dbg_io_ops->flush();
170
171 /* Now see what we get in reply. */
172 ch = gdbstub_read_wait();
173
174 if (ch == 3)
175 ch = gdbstub_read_wait();
176
177 /* If we get an ACK, we are done. */
178 if (ch == '+')
179 return;
180
181 /*
182 * If we get the start of another packet, this means
183 * that GDB is attempting to reconnect. We will NAK
184 * the packet being sent, and stop trying to send this
185 * packet.
186 */
187 if (ch == '$') {
188 dbg_io_ops->write_char('-');
189 if (dbg_io_ops->flush)
190 dbg_io_ops->flush();
191 return;
192 }
193 }
194}
195
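A worked example of the $<data>#<checksum> framing handled by get_packet()/put_packet() above; the checksum is the 8-bit sum of the payload bytes, sent as two hex digits:

/*
 * e.g. the reply "OK" is transmitted as  $OK#9a
 * ('O' + 'K' = 0x4f + 0x4b = 0x9a); the peer answers '+' to ack a good
 * checksum or '-' to request a retransmit.
 */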
196static char gdbmsgbuf[BUFMAX + 1];
197
198void gdbstub_msg_write(const char *s, int len)
199{
200 char *bufptr;
201 int wcount;
202 int i;
203
204 if (len == 0)
205 len = strlen(s);
206
207 /* 'O'utput */
208 gdbmsgbuf[0] = 'O';
209
210 /* Fill and send buffers... */
211 while (len > 0) {
212 bufptr = gdbmsgbuf + 1;
213
214 /* Calculate how many this time */
215 if ((len << 1) > (BUFMAX - 2))
216 wcount = (BUFMAX - 2) >> 1;
217 else
218 wcount = len;
219
220 /* Pack in hex chars */
221 for (i = 0; i < wcount; i++)
222 bufptr = pack_hex_byte(bufptr, s[i]);
223 *bufptr = '\0';
224
225 /* Move up */
226 s += wcount;
227 len -= wcount;
228
229 /* Write packet */
230 put_packet(gdbmsgbuf);
231 }
232}
233
234/*
235 * Convert the memory pointed to by mem into hex, placing result in
236 * buf. Return 0 on success, or a negative error code if the memory
237 * could not be read.
238 */
239int kgdb_mem2hex(char *mem, char *buf, int count)
240{
241 char *tmp;
242 int err;
243
244 /*
245 * We use the upper half of buf as an intermediate buffer for the
246 * raw memory copy. Hex conversion will work against this one.
247 */
248 tmp = buf + count;
249
250 err = probe_kernel_read(tmp, mem, count);
251 if (!err) {
252 while (count > 0) {
253 buf = pack_hex_byte(buf, *tmp);
254 tmp++;
255 count--;
256 }
257
258 *buf = 0;
259 }
260
261 return err;
262}
263
264/*
265 * Convert the hex array pointed to by buf into binary to be placed in
266 * mem. Return 0 on success, or a negative error code if the memory
267 * could not be written.
268 */
269int kgdb_hex2mem(char *buf, char *mem, int count)
270{
271 char *tmp_raw;
272 char *tmp_hex;
273
274 /*
275 * We use the upper half of buf as an intermediate buffer for the
276 * raw memory that is converted from hex.
277 */
278 tmp_raw = buf + count * 2;
279
280 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) {
282 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4;
285 }
286
287 return probe_kernel_write(mem, tmp_raw, count);
288}
289
290/*
291 * While we find nice hex chars, build a long_val.
292 * Return number of chars processed.
293 */
294int kgdb_hex2long(char **ptr, unsigned long *long_val)
295{
296 int hex_val;
297 int num = 0;
298 int negate = 0;
299
300 *long_val = 0;
301
302 if (**ptr == '-') {
303 negate = 1;
304 (*ptr)++;
305 }
306 while (**ptr) {
307 hex_val = hex(**ptr);
308 if (hex_val < 0)
309 break;
310
311 *long_val = (*long_val << 4) | hex_val;
312 num++;
313 (*ptr)++;
314 }
315
316 if (negate)
317 *long_val = -*long_val;
318
319 return num;
320}
321
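A small worked example of the parser above (the input value is chosen for illustration):

/*
 * e.g. for *ptr == "7d,...": the call sets *long_val = 0x7d, returns 2,
 * and advances *ptr to the ',' -- which is exactly what write_mem_msg()
 * below relies on when it splits "addr,length".
 */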
322/*
323 * Copy the binary array pointed to by buf into mem, unescaping $, #,
324 * and 0x7d characters that were escaped with 0x7d. The input buf is
325 * overwritten with the result. Return -EFAULT on failure or 0 on success.
326 */
327static int kgdb_ebin2mem(char *buf, char *mem, int count)
328{
329 int size = 0;
330 char *c = buf;
331
332 while (count-- > 0) {
333 c[size] = *buf++;
334 if (c[size] == 0x7d)
335 c[size] = *buf++ ^ 0x20;
336 size++;
337 }
338
339 return probe_kernel_write(mem, c, size);
340}
341
342/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary)
344{
345 char *ptr = &remcom_in_buffer[1];
346 unsigned long addr;
347 unsigned long length;
348 int err;
349
350 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
351 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
352 if (binary)
353 err = kgdb_ebin2mem(ptr, (char *)addr, length);
354 else
355 err = kgdb_hex2mem(ptr, (char *)addr, length);
356 if (err)
357 return err;
358 if (CACHE_FLUSH_IS_SAFE)
359 flush_icache_range(addr, addr + length);
360 return 0;
361 }
362
363 return -EINVAL;
364}
365
366static void error_packet(char *pkt, int error)
367{
368 error = -error;
369 pkt[0] = 'E';
370 pkt[1] = hex_asc[(error / 10)];
371 pkt[2] = hex_asc[(error % 10)];
372 pkt[3] = '\0';
373}
374
375/*
376 * Thread ID accessors. We represent a flat TID space to GDB, where
377 * the per CPU idle threads (which under Linux all have PID 0) are
378 * remapped to negative TIDs.
379 */
380
381#define BUF_THREAD_ID_SIZE 16
382
383static char *pack_threadid(char *pkt, unsigned char *id)
384{
385 char *limit;
386
387 limit = pkt + BUF_THREAD_ID_SIZE;
388 while (pkt < limit)
389 pkt = pack_hex_byte(pkt, *id++);
390
391 return pkt;
392}
393
394static void int_to_threadref(unsigned char *id, int value)
395{
396 unsigned char *scan;
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403}
404
405static struct task_struct *getthread(struct pt_regs *regs, int tid)
406{
407 /*
408 * Non-positive TIDs are remapped to the cpu shadow information
409 */
410 if (tid == 0 || tid == -1)
411 tid = -atomic_read(&kgdb_active) - 2;
412 if (tid < -1 && tid > -NR_CPUS - 2) {
413 if (kgdb_info[-tid - 2].task)
414 return kgdb_info[-tid - 2].task;
415 else
416 return idle_task(-tid - 2);
417 }
418 if (tid <= 0) {
419 printk(KERN_ERR "KGDB: Internal thread select error\n");
420 dump_stack();
421 return NULL;
422 }
423
424 /*
425 * find_task_by_pid_ns() does not take the tasklist lock anymore
426 * but is nicely RCU locked - hence is a pretty resilient
427 * thing to use:
428 */
429 return find_task_by_pid_ns(tid, &init_pid_ns);
430}
431
432
433/*
434 * Remap normal tasks to their real PID,
435 * CPU shadow threads are mapped to -CPU - 2
436 */
437static inline int shadow_pid(int realpid)
438{
439 if (realpid)
440 return realpid;
441
442 return -raw_smp_processor_id() - 2;
443}
444
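A quick worked example of the remapping implemented by shadow_pid() and getthread() above:

/* e.g. PID 1234 stays 1234; the idle task on CPU 0 is reported to GDB
 * as -2, on CPU 1 as -3, and so on. */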
445/*
446 * All the functions that start with gdb_cmd are the various
447 * operations to implement the handlers for the gdbserial protocol
448 * where KGDB is communicating with an external debugger
449 */
450
451/* Handle the '?' status packets */
452static void gdb_cmd_status(struct kgdb_state *ks)
453{
454 /*
455 * We know that this packet is only sent
456 * during initial connect. So to be safe,
457 * we clear out our breakpoints now in case
458 * GDB is reconnecting.
459 */
460 dbg_remove_all_break();
461
462 remcom_out_buffer[0] = 'S';
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464}
465
466/* Handle the 'g' get registers request */
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{
469 struct task_struct *thread;
470 void *local_debuggerinfo;
471 int i;
472
473 thread = kgdb_usethread;
474 if (!thread) {
475 thread = kgdb_info[ks->cpu].task;
476 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
477 } else {
478 local_debuggerinfo = NULL;
479 for_each_online_cpu(i) {
480 /*
481			 * Try to find the task on some other
482			 * or possibly this node. If we do not
483			 * find the matching task then we try
484			 * to approximate the results.
485 */
486 if (thread == kgdb_info[i].task)
487 local_debuggerinfo = kgdb_info[i].debuggerinfo;
488 }
489 }
490
491 /*
492 * All threads that don't have debuggerinfo should be
493 * in schedule() sleeping, since all other CPUs
494 * are in kgdb_wait, and thus have debuggerinfo.
495 */
496 if (local_debuggerinfo) {
497 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
498 } else {
499 /*
500 * Pull stuff saved during switch_to; nothing
501 * else is accessible (or even particularly
502 * relevant).
503 *
504 * This should be enough for a stack trace.
505 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 }
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509}
510
511/* Handle the 'G' set registers request */
512static void gdb_cmd_setregs(struct kgdb_state *ks)
513{
514 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
515
516 if (kgdb_usethread && kgdb_usethread != current) {
517 error_packet(remcom_out_buffer, -EINVAL);
518 } else {
519 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
520 strcpy(remcom_out_buffer, "OK");
521 }
522}
523
524/* Handle the 'm' memory read bytes */
525static void gdb_cmd_memread(struct kgdb_state *ks)
526{
527 char *ptr = &remcom_in_buffer[1];
528 unsigned long length;
529 unsigned long addr;
530 int err;
531
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err)
536 error_packet(remcom_out_buffer, err);
537 } else {
538 error_packet(remcom_out_buffer, -EINVAL);
539 }
540}
541
542/* Handle the 'M' memory write bytes */
543static void gdb_cmd_memwrite(struct kgdb_state *ks)
544{
545 int err = write_mem_msg(0);
546
547 if (err)
548 error_packet(remcom_out_buffer, err);
549 else
550 strcpy(remcom_out_buffer, "OK");
551}
552
553/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{
556 int err = write_mem_msg(1);
557
558 if (err)
559 error_packet(remcom_out_buffer, err);
560 else
561 strcpy(remcom_out_buffer, "OK");
562}
563
564/* Handle the 'D' or 'k', detach or kill packets */
565static void gdb_cmd_detachkill(struct kgdb_state *ks)
566{
567 int error;
568
569 /* The detach case */
570 if (remcom_in_buffer[0] == 'D') {
571 error = dbg_remove_all_break();
572 if (error < 0) {
573 error_packet(remcom_out_buffer, error);
574 } else {
575 strcpy(remcom_out_buffer, "OK");
576 kgdb_connected = 0;
577 }
578 put_packet(remcom_out_buffer);
579 } else {
580 /*
581 * Assume the kill case, with no exit code checking,
582 * trying to force detach the debugger:
583 */
584 dbg_remove_all_break();
585 kgdb_connected = 0;
586 }
587}
588
589/* Handle the 'R' reboot packets */
590static int gdb_cmd_reboot(struct kgdb_state *ks)
591{
592 /* For now, only honor R0 */
593 if (strcmp(remcom_in_buffer, "R0") == 0) {
594 printk(KERN_CRIT "Executing emergency reboot\n");
595 strcpy(remcom_out_buffer, "OK");
596 put_packet(remcom_out_buffer);
597
598 /*
599 * Execution should not return from
600 * machine_emergency_restart()
601 */
602 machine_emergency_restart();
603 kgdb_connected = 0;
604
605 return 1;
606 }
607 return 0;
608}
609
610/* Handle the 'q' query packets */
611static void gdb_cmd_query(struct kgdb_state *ks)
612{
613 struct task_struct *g;
614 struct task_struct *p;
615 unsigned char thref[8];
616 char *ptr;
617 int i;
618 int cpu;
619 int finished = 0;
620
621 switch (remcom_in_buffer[1]) {
622 case 's':
623 case 'f':
624 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
625 break;
626
627 i = 0;
628 remcom_out_buffer[0] = 'm';
629 ptr = remcom_out_buffer + 1;
630 if (remcom_in_buffer[1] == 'f') {
631 /* Each cpu is a shadow thread */
632 for_each_online_cpu(cpu) {
633 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ',';
638 i++;
639 }
640 }
641
642 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ',';
648 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
650 finished = 1;
651 }
652 i++;
653 } while_each_thread(g, p);
654
655 *(--ptr) = '\0';
656 break;
657
658 case 'C':
659 /* Current thread id */
660 strcpy(remcom_out_buffer, "QC");
661 ks->threadid = shadow_pid(current->pid);
662 int_to_threadref(thref, ks->threadid);
663 pack_threadid(remcom_out_buffer + 2, thref);
664 break;
665 case 'T':
666 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
667 break;
668
669 ks->threadid = 0;
670 ptr = remcom_in_buffer + 17;
671 kgdb_hex2long(&ptr, &ks->threadid);
672 if (!getthread(ks->linux_regs, ks->threadid)) {
673 error_packet(remcom_out_buffer, -EINVAL);
674 break;
675 }
676 if ((int)ks->threadid > 0) {
677 kgdb_mem2hex(getthread(ks->linux_regs,
678 ks->threadid)->comm,
679 remcom_out_buffer, 16);
680 } else {
681 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
682
683 sprintf(tmpstr, "shadowCPU%d",
684 (int)(-ks->threadid - 2));
685 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
686 }
687 break;
688#ifdef CONFIG_KGDB_KDB
689 case 'R':
690 if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
691 int len = strlen(remcom_in_buffer + 6);
692
693 if ((len % 2) != 0) {
694 strcpy(remcom_out_buffer, "E01");
695 break;
696 }
697 kgdb_hex2mem(remcom_in_buffer + 6,
698 remcom_out_buffer, len);
699 len = len / 2;
700 remcom_out_buffer[len++] = 0;
701
702 kdb_parse(remcom_out_buffer);
703 strcpy(remcom_out_buffer, "OK");
704 }
705 break;
706#endif
707 }
708}
709
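For illustration, the qfThreadInfo reply built in the 's'/'f' case above is the letter 'm' followed by a comma-separated list of thread references, each one the hex encoding of the 8-byte value produced by int_to_threadref() (four zero bytes, then the ID as a big-endian 32-bit value). The 16-hex-character width below is an assumption that matches that 8-byte reference; this is a user-space sketch, not kernel code:

#include <stdio.h>
#include <string.h>

static void append_tid(char *buf, int tid)
{
	char id[18];

	/* four zero bytes, then the ID as big-endian 32 bits, hex encoded */
	snprintf(id, sizeof(id), "00000000%08x,", (unsigned int)tid);
	strcat(buf, id);
}

int main(void)
{
	char reply[128] = "m";

	append_tid(reply, -2);			/* shadow thread for CPU 0 */
	append_tid(reply, 1);			/* PID 1 */
	reply[strlen(reply) - 1] = '\0';	/* drop the trailing ',' */
	printf("%s\n", reply);			/* m00000000fffffffe,0000000000000001 */
	return 0;
}
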
710/* Handle the 'H' task query packets */
711static void gdb_cmd_task(struct kgdb_state *ks)
712{
713 struct task_struct *thread;
714 char *ptr;
715
716 switch (remcom_in_buffer[1]) {
717 case 'g':
718 ptr = &remcom_in_buffer[2];
719 kgdb_hex2long(&ptr, &ks->threadid);
720 thread = getthread(ks->linux_regs, ks->threadid);
721 if (!thread && ks->threadid > 0) {
722 error_packet(remcom_out_buffer, -EINVAL);
723 break;
724 }
725 kgdb_usethread = thread;
726 ks->kgdb_usethreadid = ks->threadid;
727 strcpy(remcom_out_buffer, "OK");
728 break;
729 case 'c':
730 ptr = &remcom_in_buffer[2];
731 kgdb_hex2long(&ptr, &ks->threadid);
732 if (!ks->threadid) {
733 kgdb_contthread = NULL;
734 } else {
735 thread = getthread(ks->linux_regs, ks->threadid);
736 if (!thread && ks->threadid > 0) {
737 error_packet(remcom_out_buffer, -EINVAL);
738 break;
739 }
740 kgdb_contthread = thread;
741 }
742 strcpy(remcom_out_buffer, "OK");
743 break;
744 }
745}
746
747/* Handle the 'T' thread query packets */
748static void gdb_cmd_thread(struct kgdb_state *ks)
749{
750 char *ptr = &remcom_in_buffer[1];
751 struct task_struct *thread;
752
753 kgdb_hex2long(&ptr, &ks->threadid);
754 thread = getthread(ks->linux_regs, ks->threadid);
755 if (thread)
756 strcpy(remcom_out_buffer, "OK");
757 else
758 error_packet(remcom_out_buffer, -EINVAL);
759}
760
761/* Handle the 'z' or 'Z' breakpoint remove or set packets */
762static void gdb_cmd_break(struct kgdb_state *ks)
763{
764 /*
765 * Since GDB 5.3 the remote protocol has defined '0' as a software
766 * breakpoint and '1' as a hardware breakpoint, so follow that.
767 */
768 char *bpt_type = &remcom_in_buffer[1];
769 char *ptr = &remcom_in_buffer[2];
770 unsigned long addr;
771 unsigned long length;
772 int error = 0;
773
774 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
775 /* Unsupported */
776 if (*bpt_type > '4')
777 return;
778 } else {
779 if (*bpt_type != '0' && *bpt_type != '1')
780 /* Unsupported. */
781 return;
782 }
783
784 /*
785 * Test if this is a hardware breakpoint, and
786 * if we support it:
787 */
788 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
789 /* Unsupported. */
790 return;
791
792 if (*(ptr++) != ',') {
793 error_packet(remcom_out_buffer, -EINVAL);
794 return;
795 }
796 if (!kgdb_hex2long(&ptr, &addr)) {
797 error_packet(remcom_out_buffer, -EINVAL);
798 return;
799 }
800 if (*(ptr++) != ',' ||
801 !kgdb_hex2long(&ptr, &length)) {
802 error_packet(remcom_out_buffer, -EINVAL);
803 return;
804 }
805
806 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
807 error = dbg_set_sw_break(addr);
808 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
809 error = dbg_remove_sw_break(addr);
810 else if (remcom_in_buffer[0] == 'Z')
811 error = arch_kgdb_ops.set_hw_breakpoint(addr,
812 (int)length, *bpt_type - '0');
813 else if (remcom_in_buffer[0] == 'z')
814 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
815 (int) length, *bpt_type - '0');
816
817 if (error == 0)
818 strcpy(remcom_out_buffer, "OK");
819 else
820 error_packet(remcom_out_buffer, error);
821}
822
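The packets parsed above take the form Z<type>,<addr>,<length> to set a breakpoint and z<type>,<addr>,<length> to remove one, with the address and length in hex and type '0'/'1' selecting a software/hardware breakpoint. A hypothetical host-side encoder is sketched below purely as illustration; the kernel decodes such packets with kgdb_hex2long() as shown above:

#include <stdio.h>

static void encode_break(char *buf, size_t len, int set, int type,
			 unsigned long addr, unsigned long length)
{
	snprintf(buf, len, "%c%d,%lx,%lx",
		 set ? 'Z' : 'z', type, addr, length);
}

int main(void)
{
	char buf[64];

	encode_break(buf, sizeof(buf), 1, 0, 0xc01231f0UL, 1);
	printf("%s\n", buf);	/* Z0,c01231f0,1 - set a sw breakpoint */
	encode_break(buf, sizeof(buf), 0, 0, 0xc01231f0UL, 1);
	printf("%s\n", buf);	/* z0,c01231f0,1 - remove it again */
	return 0;
}
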
823/* Handle the 'C' signal / exception passing packets */
824static int gdb_cmd_exception_pass(struct kgdb_state *ks)
825{
826 /* C09 == pass exception
827 * C15 == detach kgdb, pass exception
828 */
829 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
830
831 ks->pass_exception = 1;
832 remcom_in_buffer[0] = 'c';
833
834 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
835
836 ks->pass_exception = 1;
837 remcom_in_buffer[0] = 'D';
838 dbg_remove_all_break();
839 kgdb_connected = 0;
840 return 1;
841
842 } else {
843 gdbstub_msg_write("KGDB only knows signal 9 (pass)"
844 " and 15 (pass and disconnect)\n"
845 "Executing a continue without signal passing\n", 0);
846 remcom_in_buffer[0] = 'c';
847 }
848
849 /* Indicate fall through */
850 return -1;
851}
852
853/*
854 * This function performs all gdbserial command processing
855 */
856int gdb_serial_stub(struct kgdb_state *ks)
857{
858 int error = 0;
859 int tmp;
860
861 /* Clear the out buffer. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
863
864 if (kgdb_connected) {
865 unsigned char thref[8];
866 char *ptr;
867
868 /* Reply to host that an exception has occurred */
869 ptr = remcom_out_buffer;
870 *ptr++ = 'T';
871 ptr = pack_hex_byte(ptr, ks->signo);
872 ptr += strlen(strcpy(ptr, "thread:"));
873 int_to_threadref(thref, shadow_pid(current->pid));
874 ptr = pack_threadid(ptr, thref);
875 *ptr++ = ';';
876 put_packet(remcom_out_buffer);
877 }
878
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) {
884 error = 0;
885
886 /* Clear the out buffer. */
887 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
888
889 get_packet(remcom_in_buffer);
890
891 switch (remcom_in_buffer[0]) {
892 case '?': /* gdbserial status */
893 gdb_cmd_status(ks);
894 break;
895 case 'g': /* return the value of the CPU registers */
896 gdb_cmd_getregs(ks);
897 break;
898 case 'G': /* set the value of the CPU registers - return OK */
899 gdb_cmd_setregs(ks);
900 break;
901 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
902 gdb_cmd_memread(ks);
903 break;
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks);
906 break;
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks);
909 break;
910 /* kill or detach. KGDB should treat this like a
911 * continue.
912 */
913 case 'D': /* Debugger detach */
914 case 'k': /* Debugger detach via kill */
915 gdb_cmd_detachkill(ks);
916 goto default_handle;
917 case 'R': /* Reboot */
918 if (gdb_cmd_reboot(ks))
919 goto default_handle;
920 break;
921 case 'q': /* query command */
922 gdb_cmd_query(ks);
923 break;
924 case 'H': /* task related */
925 gdb_cmd_task(ks);
926 break;
927 case 'T': /* Query thread status */
928 gdb_cmd_thread(ks);
929 break;
930 case 'z': /* Break point remove */
931 case 'Z': /* Break point set */
932 gdb_cmd_break(ks);
933 break;
934#ifdef CONFIG_KGDB_KDB
935 case '3': /* Escape back into kdb */
936 if (remcom_in_buffer[1] == '\0') {
937 gdb_cmd_detachkill(ks);
938 return DBG_PASS_EVENT;
939 }
940#endif
941 case 'C': /* Exception passing */
942 tmp = gdb_cmd_exception_pass(ks);
943 if (tmp > 0)
944 goto default_handle;
945 if (tmp == 0)
946 break;
947 /* Fall through on tmp < 0 */
948 case 'c': /* Continue packet */
949 case 's': /* Single step packet */
950 if (kgdb_contthread && kgdb_contthread != current) {
951 /* Can't switch threads in kgdb */
952 error_packet(remcom_out_buffer, -EINVAL);
953 break;
954 }
955 dbg_activate_sw_breakpoints();
956 /* Fall through to default processing */
957 default:
958default_handle:
959 error = kgdb_arch_handle_exception(ks->ex_vector,
960 ks->signo,
961 ks->err_code,
962 remcom_in_buffer,
963 remcom_out_buffer,
964 ks->linux_regs);
965 /*
966 * Leave cmd processing on error, detach,
967 * kill, continue, or single step.
968 */
969 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
970 remcom_in_buffer[0] == 'k') {
971 error = 0;
972 goto kgdb_exit;
973 }
974
975 }
976
977 /* reply to the request */
978 put_packet(remcom_out_buffer);
979 }
980
981kgdb_exit:
982 if (ks->pass_exception)
983 error = 1;
984 return error;
985}
986
987int gdbstub_state(struct kgdb_state *ks, char *cmd)
988{
989 int error;
990
991 switch (cmd[0]) {
992 case 'e':
993 error = kgdb_arch_handle_exception(ks->ex_vector,
994 ks->signo,
995 ks->err_code,
996 remcom_in_buffer,
997 remcom_out_buffer,
998 ks->linux_regs);
999 return error;
1000 case 's':
1001 case 'c':
1002 strcpy(remcom_in_buffer, cmd);
1003 return 0;
1004 case '?':
1005 gdb_cmd_status(ks);
1006 break;
1007 case '\0':
1008 strcpy(remcom_out_buffer, "");
1009 break;
1010 }
1011 dbg_io_ops->write_char('+');
1012 put_packet(remcom_out_buffer);
1013 return 0;
1014}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
1# This file is subject to the terms and conditions of the GNU General Public
2# License. See the file "COPYING" in the main directory of this archive
3# for more details.
4#
5# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7#
8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12
13clean-files := gen-kdb_cmds.c
14
15quiet_cmd_gen-kdb = GENKDB $@
16 cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
17 /^\#/{next} \
18 /^[ \t]*$$/{next} \
19 {gsub(/"/, "\\\"", $$0); \
20 print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
21 END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
22 $(filter-out %/Makefile,$^) > $@#
23
24$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
25 $(call cmd,gen-kdb)
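For illustration, a non-comment kdb_cmds line such as "set LINES 10000" would make the awk rule above emit roughly the following C into gen-kdb_cmds.c: one kdb_cmdN[] string per input line, collected into the NULL-terminated kdb_cmds[] array that is replayed at kdb_init() time (see the kdb_cmds header further down):

#include <linux/stddef.h>
#include <linux/init.h>
static __initdata char kdb_cmd0[] = "set LINES 10000\n";
extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {
  kdb_cmd0,
  NULL
};
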
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..75bd9b3ebbb7
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,564 @@
1/*
2 * Kernel Debugger Architecture Independent Breakpoint Handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/kdb.h>
16#include <linux/kgdb.h>
17#include <linux/smp.h>
18#include <linux/sched.h>
19#include <linux/interrupt.h>
20#include "kdb_private.h"
21
22/*
23 * Table of kdb_breakpoints
24 */
25kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
26
27static void kdb_setsinglestep(struct pt_regs *regs)
28{
29 KDB_STATE_SET(DOING_SS);
30}
31
32static char *kdb_rwtypes[] = {
33 "Instruction(i)",
34 "Instruction(Register)",
35 "Data Write",
36 "I/O",
37 "Data Access"
38};
39
40static char *kdb_bptype(kdb_bp_t *bp)
41{
42 if (bp->bp_type < 0 || bp->bp_type > 4)
43 return "";
44
45 return kdb_rwtypes[bp->bp_type];
46}
47
48static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
49{
50 int nextarg = *nextargp;
51 int diag;
52
53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else
62 return KDB_ARGCOUNT;
63
64 bp->bph_length = 1;
65
66 nextarg++;
67
68 if ((argc + 1) != nextarg) {
69 unsigned long len;
70
71 diag = kdbgetularg((char *)argv[nextarg],
72 &len);
73 if (diag)
74 return diag;
75
76
77 if (len > 8)
78 return KDB_BADLENGTH;
79
80 bp->bph_length = len;
81 nextarg++;
82 }
83
84 if ((argc + 1) != nextarg)
85 return KDB_ARGCOUNT;
86 }
87
88 *nextargp = nextarg;
89 return 0;
90}
91
92static int _kdb_bp_remove(kdb_bp_t *bp)
93{
94 int ret = 1;
95 if (!bp->bp_installed)
96 return ret;
97 if (!bp->bp_type)
98 ret = dbg_remove_sw_break(bp->bp_addr);
99 else
100 ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
101 bp->bph_length,
102 bp->bp_type);
103 if (ret == 0)
104 bp->bp_installed = 0;
105 return ret;
106}
107
108static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
109{
110 if (KDB_DEBUG(BP))
111 kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
112
113 /*
114 * Setup single step
115 */
116 kdb_setsinglestep(regs);
117
118 /*
119 * Reset delay attribute
120 */
121 bp->bp_delay = 0;
122 bp->bp_delayed = 1;
123}
124
125static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
126{
127 int ret;
128 /*
129 * Install the breakpoint, if it is not already installed.
130 */
131
132 if (KDB_DEBUG(BP))
133 kdb_printf("%s: bp_installed %d\n",
134 __func__, bp->bp_installed);
135 if (!KDB_STATE(SSBPT))
136 bp->bp_delay = 0;
137 if (bp->bp_installed)
138 return 1;
139 if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
140 if (KDB_DEBUG(BP))
141 kdb_printf("%s: delayed bp\n", __func__);
142 kdb_handle_bp(regs, bp);
143 return 0;
144 }
145 if (!bp->bp_type)
146 ret = dbg_set_sw_break(bp->bp_addr);
147 else
148 ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
149 bp->bph_length,
150 bp->bp_type);
151 if (ret == 0) {
152 bp->bp_installed = 1;
153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr);
156 return 1;
157 }
158 return 0;
159}
160
161/*
162 * kdb_bp_install
163 *
164 * Install kdb_breakpoints prior to returning from the
165 * kernel debugger. This allows the kdb_breakpoints to be set
166 * upon functions that are used internally by kdb, such as
167 * printk(). This function is only called once per kdb session.
168 */
169void kdb_bp_install(struct pt_regs *regs)
170{
171 int i;
172
173 for (i = 0; i < KDB_MAXBPT; i++) {
174 kdb_bp_t *bp = &kdb_breakpoints[i];
175
176 if (KDB_DEBUG(BP)) {
177 kdb_printf("%s: bp %d bp_enabled %d\n",
178 __func__, i, bp->bp_enabled);
179 }
180 if (bp->bp_enabled)
181 _kdb_bp_install(regs, bp);
182 }
183}
184
185/*
186 * kdb_bp_remove
187 *
188 * Remove kdb_breakpoints upon entry to the kernel debugger.
189 *
190 * Parameters:
191 * None.
192 * Outputs:
193 * None.
194 * Returns:
195 * None.
196 * Locking:
197 * None.
198 * Remarks:
199 */
200void kdb_bp_remove(void)
201{
202 int i;
203
204 for (i = KDB_MAXBPT - 1; i >= 0; i--) {
205 kdb_bp_t *bp = &kdb_breakpoints[i];
206
207 if (KDB_DEBUG(BP)) {
208 kdb_printf("%s: bp %d bp_enabled %d\n",
209 __func__, i, bp->bp_enabled);
210 }
211 if (bp->bp_enabled)
212 _kdb_bp_remove(bp);
213 }
214}
215
216
217/*
218 * kdb_printbp
219 *
220 * Internal function to format and print a breakpoint entry.
221 *
222 * Parameters:
223 * None.
224 * Outputs:
225 * None.
226 * Returns:
227 * None.
228 * Locking:
229 * None.
230 * Remarks:
231 */
232
233static void kdb_printbp(kdb_bp_t *bp, int i)
234{
235 kdb_printf("%s ", kdb_bptype(bp));
236 kdb_printf("BP #%d at ", i);
237 kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
238
239 if (bp->bp_enabled)
240 kdb_printf("\n is enabled");
241 else
242 kdb_printf("\n is disabled");
243
244 kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
245 bp->bp_addr, bp->bp_type, bp->bp_installed);
246
247 kdb_printf("\n");
248}
249
250/*
251 * kdb_bp
252 *
253 * Handle the bp commands.
254 *
255 * [bp|bph] <addr-expression> [DATAR|DATAW]
256 *
257 * Parameters:
258 * argc Count of arguments in argv
259 * argv Space delimited command line arguments
260 * Outputs:
261 * None.
262 * Returns:
263 * Zero for success, a kdb diagnostic if failure.
264 * Locking:
265 * None.
266 * Remarks:
267 *
268 * bp Set breakpoint on all cpus. Only use hardware assist if needed.
269 * bph Set breakpoint on all cpus. Force use of a hardware register.
270 */
271
272static int kdb_bp(int argc, const char **argv)
273{
274 int i, bpno;
275 kdb_bp_t *bp, *bp_check;
276 int diag;
277 int free;
278 char *symname = NULL;
279 long offset = 0ul;
280 int nextarg;
281 kdb_bp_t template = {0};
282
283 if (argc == 0) {
284 /*
285 * Display breakpoint table
286 */
287 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
288 bpno++, bp++) {
289 if (bp->bp_free)
290 continue;
291 kdb_printbp(bp, bpno);
292 }
293
294 return 0;
295 }
296
297 nextarg = 1;
298 diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
299 &offset, &symname);
300 if (diag)
301 return diag;
302 if (!template.bp_addr)
303 return KDB_BADINT;
304
305 /*
306 * Find an empty bp structure to allocate
307 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free)
311 break;
312 }
313
314 if (bpno == KDB_MAXBPT)
315 return KDB_TOOMANYBPT;
316
317 if (strcmp(argv[0], "bph") == 0) {
318 template.bp_type = BP_HARDWARE_BREAKPOINT;
319 diag = kdb_parsebp(argc, argv, &nextarg, &template);
320 if (diag)
321 return diag;
322 } else {
323 template.bp_type = BP_BREAKPOINT;
324 }
325
326 /*
327 * Check for clashing breakpoints.
328 *
329 * Note, in this design we can't have hardware breakpoints
330 * enabled for both read and write on the same address.
331 */
332 for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
333 i++, bp_check++) {
334 if (!bp_check->bp_free &&
335 bp_check->bp_addr == template.bp_addr) {
336 kdb_printf("You already have a breakpoint at "
337 kdb_bfd_vma_fmt0 "\n", template.bp_addr);
338 return KDB_DUPBPT;
339 }
340 }
341
342 template.bp_enabled = 1;
343
344 /*
345 * Actually allocate the breakpoint found earlier
346 */
347 *bp = template;
348 bp->bp_free = 0;
349
350 kdb_printbp(bp, bpno);
351
352 return 0;
353}
354
355/*
356 * kdb_bc
357 *
358 * Handles the 'bc', 'be', and 'bd' commands
359 *
360 * [bd|bc|be] <breakpoint-number>
361 * [bd|bc|be] *
362 *
363 * Parameters:
364 * argc Count of arguments in argv
365 * argv Space delimited command line arguments
366 * Outputs:
367 * None.
368 * Returns:
369 * Zero for success, a kdb diagnostic for failure
370 * Locking:
371 * None.
372 * Remarks:
373 */
374static int kdb_bc(int argc, const char **argv)
375{
376 unsigned long addr;
377 kdb_bp_t *bp = NULL;
378 int lowbp = KDB_MAXBPT;
379 int highbp = 0;
380 int done = 0;
381 int i;
382 int diag = 0;
383
384 int cmd; /* KDBCMD_B? */
385#define KDBCMD_BC 0
386#define KDBCMD_BE 1
387#define KDBCMD_BD 2
388
389 if (strcmp(argv[0], "be") == 0)
390 cmd = KDBCMD_BE;
391 else if (strcmp(argv[0], "bd") == 0)
392 cmd = KDBCMD_BD;
393 else
394 cmd = KDBCMD_BC;
395
396 if (argc != 1)
397 return KDB_ARGCOUNT;
398
399 if (strcmp(argv[1], "*") == 0) {
400 lowbp = 0;
401 highbp = KDB_MAXBPT;
402 } else {
403 diag = kdbgetularg(argv[1], &addr);
404 if (diag)
405 return diag;
406
407 /*
408 * For addresses less than the maximum breakpoint number,
409 * assume that the breakpoint number is desired.
410 */
411 if (addr < KDB_MAXBPT) {
412 bp = &kdb_breakpoints[addr];
413 lowbp = highbp = addr;
414 highbp++;
415 } else {
416 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
417 i++, bp++) {
418 if (bp->bp_addr == addr) {
419 lowbp = highbp = i;
420 highbp++;
421 break;
422 }
423 }
424 }
425 }
426
427 /*
428 * Now operate on the set of breakpoints matching the input
429 * criteria (either '*' for all, or an individual breakpoint).
430 */
431 for (bp = &kdb_breakpoints[lowbp], i = lowbp;
432 i < highbp;
433 i++, bp++) {
434 if (bp->bp_free)
435 continue;
436
437 done++;
438
439 switch (cmd) {
440 case KDBCMD_BC:
441 bp->bp_enabled = 0;
442
443 kdb_printf("Breakpoint %d at "
444 kdb_bfd_vma_fmt " cleared\n",
445 i, bp->bp_addr);
446
447 bp->bp_addr = 0;
448 bp->bp_free = 1;
449
450 break;
451 case KDBCMD_BE:
452 bp->bp_enabled = 1;
453
454 kdb_printf("Breakpoint %d at "
455 kdb_bfd_vma_fmt " enabled",
456 i, bp->bp_addr);
457
458 kdb_printf("\n");
459 break;
460 case KDBCMD_BD:
461 if (!bp->bp_enabled)
462 break;
463
464 bp->bp_enabled = 0;
465
466 kdb_printf("Breakpoint %d at "
467 kdb_bfd_vma_fmt " disabled\n",
468 i, bp->bp_addr);
469
470 break;
471 }
472 if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
473 bp->bp_delay = 0;
474 KDB_STATE_CLEAR(SSBPT);
475 }
476 }
477
478 return (!done) ? KDB_BPTNOTFOUND : 0;
479}
480
481/*
482 * kdb_ss
483 *
484 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
485 * commands.
486 *
487 * ss
488 * ssb
489 *
490 * Parameters:
491 * argc Argument count
492 * argv Argument vector
493 * Outputs:
494 * None.
495 * Returns:
496 * KDB_CMD_SS[B] for success, a kdb error if failure.
497 * Locking:
498 * None.
499 * Remarks:
500 *
501 * Set the arch specific option to trigger a debug trap after the next
502 * instruction.
503 *
504 * For 'ssb', set the trace flag in the debug trap handler
505 * after printing the current insn and return directly without
506 * invoking the kdb command processor, until a branch instruction
507 * is encountered.
508 */
509
510static int kdb_ss(int argc, const char **argv)
511{
512 int ssb = 0;
513
514 ssb = (strcmp(argv[0], "ssb") == 0);
515 if (argc != 0)
516 return KDB_ARGCOUNT;
517 /*
518 * Set trace flag and go.
519 */
520 KDB_STATE_SET(DOING_SS);
521 if (ssb) {
522 KDB_STATE_SET(DOING_SSB);
523 return KDB_CMD_SSB;
524 }
525 return KDB_CMD_SS;
526}
527
528/* Initialize the breakpoint table and register breakpoint commands. */
529
530void __init kdb_initbptab(void)
531{
532 int i;
533 kdb_bp_t *bp;
534
535 /*
536 * First time initialization.
537 */
538 memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
539
540 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
541 bp->bp_free = 1;
542
543 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
544 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
545 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
546 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
547 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
548 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
549 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
550 kdb_register_repeat("bc", kdb_bc, "<bpnum>",
551 "Clear Breakpoint", 0, KDB_REPEAT_NONE);
552 kdb_register_repeat("be", kdb_bc, "<bpnum>",
553 "Enable Breakpoint", 0, KDB_REPEAT_NONE);
554 kdb_register_repeat("bd", kdb_bc, "<bpnum>",
555 "Disable Breakpoint", 0, KDB_REPEAT_NONE);
556
557 kdb_register_repeat("ss", kdb_ss, "",
558 "Single Step", 1, KDB_REPEAT_NO_ARGS);
559 kdb_register_repeat("ssb", kdb_ss, "",
560 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
561 /*
562 * Architecture dependent initialization.
563 */
564}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
1/*
2 * Kernel Debugger Architecture Independent Stack Traceback
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/ctype.h>
13#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/sched.h>
16#include <linux/kdb.h>
17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h"
20
21
22static void kdb_show_stack(struct task_struct *p, void *addr)
23{
24 int old_lvl = console_loglevel;
25 console_loglevel = 15;
26 kdb_trap_printk++;
27 kdb_set_current_task(p);
28 if (addr) {
29 show_stack((struct task_struct *)p, addr);
30 } else if (kdb_current_regs) {
31#ifdef CONFIG_X86
32 show_stack(p, &kdb_current_regs->sp);
33#else
34 show_stack(p, NULL);
35#endif
36 } else {
37 show_stack(p, NULL);
38 }
39 console_loglevel = old_lvl;
40 kdb_trap_printk--;
41}
42
43/*
44 * kdb_bt
45 *
46 * This function implements the 'bt' command. Print a stack
47 * traceback.
48 *
49 * bt [<address-expression>] (addr-exp is for alternate stacks)
50 * btp <pid> Kernel stack for <pid>
51 * btt <address-expression> Kernel stack for task structure at
52 * <address-expression>
53 * bta [DRSTCZEUIMA] All useful processes, optionally
54 * filtered by state
55 * btc [<cpu>] The current process on one cpu,
56 * default is all cpus
57 *
58 * bt <address-expression> refers to an address on the stack; that location
59 * is assumed to contain a return address.
60 *
61 * btt <address-expression> refers to the address of a struct task.
62 *
63 * Inputs:
64 * argc argument count
65 * argv argument vector
66 * Outputs:
67 * None.
68 * Returns:
69 * zero for success, a kdb diagnostic if error
70 * Locking:
71 * none.
72 * Remarks:
73 * Backtraces work best when the code uses frame pointers. But even
74 * without frame pointers we should get a reasonable trace.
75 *
76 * mds comes in handy when examining the stack to do a manual traceback or
77 * to get a starting point for bt <address-expression>.
78 */
79
80static int
81kdb_bt1(struct task_struct *p, unsigned long mask,
82 int argcount, int btaprompt)
83{
84 char buffer[2];
85 if (kdb_getarea(buffer[0], (unsigned long)p) ||
86 kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
87 return KDB_BADADDR;
88 if (!kdb_task_state(p, mask))
89 return 0;
90 kdb_printf("Stack traceback for pid %d\n", p->pid);
91 kdb_ps1(p);
92 kdb_show_stack(p, NULL);
93 if (btaprompt) {
94 kdb_getstr(buffer, sizeof(buffer),
95 "Enter <q> to end, <cr> to continue:");
96 if (buffer[0] == 'q') {
97 kdb_printf("\n");
98 return 1;
99 }
100 }
101 touch_nmi_watchdog();
102 return 0;
103}
104
105int
106kdb_bt(int argc, const char **argv)
107{
108 int diag;
109 int argcount = 5;
110 int btaprompt = 1;
111 int nextarg;
112 unsigned long addr;
113 long offset;
114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
117 * proc in bta */
118
119 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p;
121 unsigned long cpu;
122 unsigned long mask = kdb_task_state_string(argc ? argv[1] :
123 NULL);
124 if (argc == 0)
125 kdb_ps_suppressed();
126 /* Run the active tasks first */
127 for_each_online_cpu(cpu) {
128 p = kdb_curr_task(cpu);
129 if (kdb_bt1(p, mask, argcount, btaprompt))
130 return 0;
131 }
132 /* Now the inactive tasks */
133 kdb_do_each_thread(g, p) {
134 if (task_curr(p))
135 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt))
137 return 0;
138 } kdb_while_each_thread(g, p);
139 } else if (strcmp(argv[0], "btp") == 0) {
140 struct task_struct *p;
141 unsigned long pid;
142 if (argc != 1)
143 return KDB_ARGCOUNT;
144 diag = kdbgetularg((char *)argv[1], &pid);
145 if (diag)
146 return diag;
147 p = find_task_by_pid_ns(pid, &init_pid_ns);
148 if (p) {
149 kdb_set_current_task(p);
150 return kdb_bt1(p, ~0UL, argcount, 0);
151 }
152 kdb_printf("No process with pid == %ld found\n", pid);
153 return 0;
154 } else if (strcmp(argv[0], "btt") == 0) {
155 if (argc != 1)
156 return KDB_ARGCOUNT;
157 diag = kdbgetularg((char *)argv[1], &addr);
158 if (diag)
159 return diag;
160 kdb_set_current_task((struct task_struct *)addr);
161 return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
162 } else if (strcmp(argv[0], "btc") == 0) {
163 unsigned long cpu = ~0;
164 struct task_struct *save_current_task = kdb_current_task;
165 char buf[80];
166 if (argc > 1)
167 return KDB_ARGCOUNT;
168 if (argc == 1) {
169 diag = kdbgetularg((char *)argv[1], &cpu);
170 if (diag)
171 return diag;
172 }
173 /* Recursive use of kdb_parse, do not use argv after
174 * this point */
175 argv = NULL;
176 if (cpu != ~0) {
177 if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
178 kdb_printf("no process for cpu %ld\n", cpu);
179 return 0;
180 }
181 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
182 kdb_parse(buf);
183 return 0;
184 }
185 kdb_printf("btc: cpu status: ");
186 kdb_parse("cpu\n");
187 for_each_online_cpu(cpu) {
188 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
189 kdb_parse(buf);
190 touch_nmi_watchdog();
191 }
192 kdb_set_current_task(save_current_task);
193 return 0;
194 } else {
195 if (argc) {
196 nextarg = 1;
197 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
198 &offset, NULL);
199 if (diag)
200 return diag;
201 kdb_show_stack(kdb_current_task, (void *)addr);
202 return 0;
203 } else {
204 return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
205 }
206 }
207
208 /* NOTREACHED */
209 return 0;
210}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
1# Initial commands for kdb, alter to suit your needs.
2# These commands are executed in kdb_init() context, no SMP, no
3# processes. Commands that require process data (including stack or
4# registers) are not reliable this early. set and bp commands should
5# be safe. Global breakpoint commands affect each cpu as it is booted.
6
7# Standard debugging information for first level support, just type dumpall
8# or dumpcpu or dumpcommon at the kdb prompt.
9
10defcmd dumpcommon "" "Common kdb debugging"
11 set BTAPROMPT 0
12 set LINES 10000
13 -summary
14 -cpu
15 -ps
16 -dmesg 600
17 -bt
18endefcmd
19
20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R
24 -dumpcommon
25 -bta
26endefcmd
27
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R
32 -dumpcommon
33 -btc
34endefcmd
35
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/kgdb.h>
12#include <linux/kdb.h>
13#include <linux/kdebug.h>
14#include "kdb_private.h"
15#include "../debug_core.h"
16
17/*
18 * KDB interface to KGDB internals
19 */
20get_char_func kdb_poll_funcs[] = {
21 dbg_io_get_char,
22 NULL,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27};
28EXPORT_SYMBOL_GPL(kdb_poll_funcs);
29
30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32
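The exported poll table above is how additional polled input sources reach kdb: each entry is a get_char_func that returns the next character or -1 for "nothing available", and kdb_poll_idx counts the slots in use. A hypothetical driver-side registration is sketched below; the function names are made up and the declarations are assumed to be visible via <linux/kdb.h>:

#include <linux/kdb.h>

static int example_get_char(void)
{
	return -1;	/* -1 means no character is available right now */
}

static void example_register_kdb_poller(void)
{
	/* take the next free NULL slot in the table above */
	kdb_poll_funcs[kdb_poll_idx++] = example_get_char;
}
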
33int kdb_stub(struct kgdb_state *ks)
34{
35 int error = 0;
36 kdb_bp_t *bp;
37 unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
38 kdb_reason_t reason = KDB_REASON_OOPS;
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i;
41
42 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY);
45 addr = instruction_pointer(ks->linux_regs);
46 }
47 ks->pass_exception = 0;
48 if (atomic_read(&kgdb_setting_breakpoint))
49 reason = KDB_REASON_KEYBOARD;
50
51 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
52 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
53 reason = KDB_REASON_BREAK;
54 db_result = KDB_DB_BPT;
55 if (addr != instruction_pointer(ks->linux_regs))
56 kgdb_arch_set_pc(ks->linux_regs, addr);
57 break;
58 }
59 }
60 if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
61 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
62 if (bp->bp_free)
63 continue;
64 if (bp->bp_addr == addr) {
65 bp->bp_delay = 1;
66 bp->bp_delayed = 1;
67 /*
68 * SSBPT is set when the kernel debugger must single step a
69 * task in order to re-establish an instruction breakpoint
70 * which uses the instruction replacement mechanism. It is
71 * cleared by any action that removes the need to single-step
72 * the breakpoint.
73 */
74 reason = KDB_REASON_BREAK;
75 db_result = KDB_DB_BPT;
76 KDB_STATE_SET(SSBPT);
77 break;
78 }
79 }
80 }
81
82 if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
83 ks->signo == SIGTRAP) {
84 reason = KDB_REASON_SSTEP;
85 db_result = KDB_DB_BPT;
86 }
87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu;
90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */
93 kdb_bp_remove();
94 KDB_STATE_CLEAR(DOING_SS);
95 KDB_STATE_CLEAR(DOING_SSB);
96 KDB_STATE_SET(PAGER);
97 /* zero out any offline cpu data */
98 for_each_present_cpu(i) {
99 if (!cpu_online(i)) {
100 kgdb_info[i].debuggerinfo = NULL;
101 kgdb_info[i].task = NULL;
102 }
103 }
104 if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC);
107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS);
112 } else {
113 /* Start kdb main loop */
114 error = kdb_main_loop(KDB_REASON_ENTER, reason,
115 ks->err_code, db_result, ks->linux_regs);
116 }
117 /*
118 * Upon exit from the kdb main loop setup break points and restart
119 * the system based on the requested continue state
120 */
121 kdb_initial_cpu = -1;
122 kdb_current_task = NULL;
123 kdb_current_regs = NULL;
124 KDB_STATE_CLEAR(PAGER);
125 kdbnearsym_cleanup();
126 if (error == KDB_CMD_KGDB) {
127 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
128 /*
129 * This interface glue allows kdb to transition into
130 * the gdb stub. To do this the '?' or '' gdb serial
131 * packet response is processed here, and then control
132 * is passed to the gdbstub.
133 */
134 if (KDB_STATE(DOING_KGDB))
135 gdbstub_state(ks, "?");
136 else
137 gdbstub_state(ks, "");
138 KDB_STATE_CLEAR(DOING_KGDB);
139 KDB_STATE_CLEAR(DOING_KGDB2);
140 }
141 return DBG_PASS_EVENT;
142 }
143 kdb_bp_install(ks->linux_regs);
144 dbg_activate_sw_breakpoints();
145 /* Set the exit state to a single step or a continue */
146 if (KDB_STATE(DOING_SS))
147 gdbstub_state(ks, "s");
148 else
149 gdbstub_state(ks, "c");
150
151 KDB_FLAG_CLEAR(CATASTROPHIC);
152
153 /* Invoke arch specific exception handling prior to system resume */
154 kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
155 if (ks->pass_exception)
156 kgdb_info[ks->cpu].ret_state = 1;
157 if (error == KDB_CMD_CPU) {
158 KDB_STATE_SET(REENTRY);
159 /*
160 * Force clear the single step bit because kdb emulates this
161 * differently vs the gdbstub
162 */
163 kgdb_single_step = 0;
164 dbg_deactivate_sw_breakpoints();
165 return DBG_SWITCH_CPU_EVENT;
166 }
167 return kgdb_info[ks->cpu].ret_state;
168}
169
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
1/*
2 * Kernel Debugger Architecture Independent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/kdev_t.h>
18#include <linux/console.h>
19#include <linux/string.h>
20#include <linux/sched.h>
21#include <linux/smp.h>
22#include <linux/nmi.h>
23#include <linux/delay.h>
24#include <linux/kgdb.h>
25#include <linux/kdb.h>
26#include <linux/kallsyms.h>
27#include "kdb_private.h"
28
29#define CMD_BUFLEN 256
30char kdb_prompt_str[CMD_BUFLEN];
31
32int kdb_trap_printk;
33
34static void kgdb_transition_check(char *buffer)
35{
36 int slen = strlen(buffer);
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer);
42 }
43}
44
45static int kdb_read_get_key(char *buffer, size_t bufsize)
46{
47#define ESCAPE_UDELAY 1000
48#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
49 char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
50 char *ped = escape_data;
51 int escape_delay = 0;
52 get_char_func *f, *f_escape = NULL;
53 int key;
54
55 for (f = &kdb_poll_funcs[0]; ; ++f) {
56 if (*f == NULL) {
57 /* Reset NMI watchdog once per poll loop */
58 touch_nmi_watchdog();
59 f = &kdb_poll_funcs[0];
60 }
61 if (escape_delay == 2) {
62 *ped = '\0';
63 ped = escape_data;
64 --escape_delay;
65 }
66 if (escape_delay == 1) {
67 key = *ped++;
68 if (!*ped)
69 --escape_delay;
70 break;
71 }
72 key = (*f)();
73 if (key == -1) {
74 if (escape_delay) {
75 udelay(ESCAPE_UDELAY);
76 --escape_delay;
77 }
78 continue;
79 }
80 if (bufsize <= 2) {
81 if (key == '\r')
82 key = '\n';
83 *buffer++ = key;
84 *buffer = '\0';
85 return -1;
86 }
87 if (escape_delay == 0 && key == '\e') {
88 escape_delay = ESCAPE_DELAY;
89 ped = escape_data;
90 f_escape = f;
91 }
92 if (escape_delay) {
93 *ped++ = key;
94 if (f_escape != f) {
95 escape_delay = 2;
96 continue;
97 }
98 if (ped - escape_data == 1) {
99 /* \e */
100 continue;
101 } else if (ped - escape_data == 2) {
102 /* \e<something> */
103 if (key != '[')
104 escape_delay = 2;
105 continue;
106 } else if (ped - escape_data == 3) {
107 /* \e[<something> */
108 int mapkey = 0;
109 switch (key) {
110 case 'A': /* \e[A, up arrow */
111 mapkey = 16;
112 break;
113 case 'B': /* \e[B, down arrow */
114 mapkey = 14;
115 break;
116 case 'C': /* \e[C, right arrow */
117 mapkey = 6;
118 break;
119 case 'D': /* \e[D, left arrow */
120 mapkey = 2;
121 break;
122 case '1': /* fall through */
123 case '3': /* fall through */
124 /* \e[<1,3,4>], may be home, del, end */
125 case '4':
126 mapkey = -1;
127 break;
128 }
129 if (mapkey != -1) {
130 if (mapkey > 0) {
131 escape_data[0] = mapkey;
132 escape_data[1] = '\0';
133 }
134 escape_delay = 2;
135 }
136 continue;
137 } else if (ped - escape_data == 4) {
138 /* \e[<1,3,4><something> */
139 int mapkey = 0;
140 if (key == '~') {
141 switch (escape_data[2]) {
142 case '1': /* \e[1~, home */
143 mapkey = 1;
144 break;
145 case '3': /* \e[3~, del */
146 mapkey = 4;
147 break;
148 case '4': /* \e[4~, end */
149 mapkey = 5;
150 break;
151 }
152 }
153 if (mapkey > 0) {
154 escape_data[0] = mapkey;
155 escape_data[1] = '\0';
156 }
157 escape_delay = 2;
158 continue;
159 }
160 }
161 break; /* A key to process */
162 }
163 return key;
164}
165
166/*
167 * kdb_read
168 *
169 * This function reads a string of characters, terminated by
170 * a newline, or by reaching the end of the supplied buffer,
171 * from the current kernel debugger console device.
172 * Parameters:
173 * buffer - Address of character buffer to receive input characters.
174 * bufsize - size, in bytes, of the character buffer
175 * Returns:
176 * Returns a pointer to the buffer containing the received
177 * character string. This string will be terminated by a
178 * newline character.
179 * Locking:
180 * No locks are required to be held upon entry to this
181 * function. It is not reentrant - it relies on the fact
182 * that kdb runs on only one "master debug" cpu at a time.
183 * Remarks:
184 *
185 * The buffer size must be >= 2. A buffer size of 2 means that the caller only
186 * wants a single key.
187 *
188 * An escape key could be the start of a vt100 control sequence such as \e[D
189 * (left arrow) or it could be a character in its own right. The standard
190 * method for detecting the difference is to wait for 2 seconds to see if there
191 * are any other characters. kdb is complicated by the lack of a timer service
192 * (interrupts are off), by multiple input sources and by the need to sometimes
193 * return after just one key. Escape sequence processing has to be done as
194 * states in the polling loop.
195 */
196
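For reference, these are the escape sequences that kdb_read_get_key() above collapses and the single-byte codes that the switch in kdb_read() then acts on. The table form is illustration only; the kernel keeps this logic inline in the polling loop:

struct kdb_esc_example {
	const char *seq;	/* bytes following the initial '\e' */
	int key;		/* code handed on to kdb_read() */
};

static const struct kdb_esc_example kdb_esc_example_map[] = {
	{ "[A",  16 },		/* up arrow */
	{ "[B",  14 },		/* down arrow */
	{ "[C",   6 },		/* right arrow */
	{ "[D",   2 },		/* left arrow */
	{ "[1~",  1 },		/* home */
	{ "[3~",  4 },		/* delete */
	{ "[4~",  5 },		/* end */
};
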
197static char *kdb_read(char *buffer, size_t bufsize)
198{
199 char *cp = buffer;
200 char *bufend = buffer+bufsize-2; /* Reserve space for newline
201 * and null byte */
202 char *lastchar;
203 char *p_tmp;
204 char tmp;
205 static char tmpbuffer[CMD_BUFLEN];
206 int len = strlen(buffer);
207 int len_tmp;
208 int tab = 0;
209 int count;
210 int i;
211 int diag, dtab_count;
212 int key;
213
214
215 diag = kdbgetintenv("DTABCOUNT", &dtab_count);
216 if (diag)
217 dtab_count = 30;
218
219 if (len > 0) {
220 cp += len;
221 if (*(buffer+len-1) == '\n')
222 cp--;
223 }
224
225 lastchar = cp;
226 *cp = '\0';
227 kdb_printf("%s", buffer);
228poll_again:
229 key = kdb_read_get_key(buffer, bufsize);
230 if (key == -1)
231 return buffer;
232 if (key != 9)
233 tab = 0;
234 switch (key) {
235 case 8: /* backspace */
236 if (cp > buffer) {
237 if (cp < lastchar) {
238 memcpy(tmpbuffer, cp, lastchar - cp);
239 memcpy(cp-1, tmpbuffer, lastchar - cp);
240 }
241 *(--lastchar) = '\0';
242 --cp;
243 kdb_printf("\b%s \r", cp);
244 tmp = *cp;
245 *cp = '\0';
246 kdb_printf(kdb_prompt_str);
247 kdb_printf("%s", buffer);
248 *cp = tmp;
249 }
250 break;
251 case 13: /* enter */
252 *lastchar++ = '\n';
253 *lastchar++ = '\0';
254 kdb_printf("\n");
255 return buffer;
256 case 4: /* Del */
257 if (cp < lastchar) {
258 memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
259 memcpy(cp, tmpbuffer, lastchar - cp - 1);
260 *(--lastchar) = '\0';
261 kdb_printf("%s \r", cp);
262 tmp = *cp;
263 *cp = '\0';
264 kdb_printf(kdb_prompt_str);
265 kdb_printf("%s", buffer);
266 *cp = tmp;
267 }
268 break;
269 case 1: /* Home */
270 if (cp > buffer) {
271 kdb_printf("\r");
272 kdb_printf(kdb_prompt_str);
273 cp = buffer;
274 }
275 break;
276 case 5: /* End */
277 if (cp < lastchar) {
278 kdb_printf("%s", cp);
279 cp = lastchar;
280 }
281 break;
282 case 2: /* Left */
283 if (cp > buffer) {
284 kdb_printf("\b");
285 --cp;
286 }
287 break;
288 case 14: /* Down */
289 memset(tmpbuffer, ' ',
290 strlen(kdb_prompt_str) + (lastchar-buffer));
291 *(tmpbuffer+strlen(kdb_prompt_str) +
292 (lastchar-buffer)) = '\0';
293 kdb_printf("\r%s\r", tmpbuffer);
294 *lastchar = (char)key;
295 *(lastchar+1) = '\0';
296 return lastchar;
297 case 6: /* Right */
298 if (cp < lastchar) {
299 kdb_printf("%c", *cp);
300 ++cp;
301 }
302 break;
303 case 16: /* Up */
304 memset(tmpbuffer, ' ',
305 strlen(kdb_prompt_str) + (lastchar-buffer));
306 *(tmpbuffer+strlen(kdb_prompt_str) +
307 (lastchar-buffer)) = '\0';
308 kdb_printf("\r%s\r", tmpbuffer);
309 *lastchar = (char)key;
310 *(lastchar+1) = '\0';
311 return lastchar;
312 case 9: /* Tab */
313 if (tab < 2)
314 ++tab;
315 p_tmp = buffer;
316 while (*p_tmp == ' ')
317 p_tmp++;
318 if (p_tmp > cp)
319 break;
320 memcpy(tmpbuffer, p_tmp, cp-p_tmp);
321 *(tmpbuffer + (cp-p_tmp)) = '\0';
322 p_tmp = strrchr(tmpbuffer, ' ');
323 if (p_tmp)
324 ++p_tmp;
325 else
326 p_tmp = tmpbuffer;
327 len = strlen(p_tmp);
328 count = kallsyms_symbol_complete(p_tmp,
329 sizeof(tmpbuffer) -
330 (p_tmp - tmpbuffer));
331 if (tab == 2 && count > 0) {
332 kdb_printf("\n%d symbols are found.", count);
333 if (count > dtab_count) {
334 count = dtab_count;
335 kdb_printf(" But only first %d symbols will"
336 " be printed.\nYou can change the"
337 " environment variable DTABCOUNT.",
338 count);
339 }
340 kdb_printf("\n");
341 for (i = 0; i < count; i++) {
342 if (kallsyms_symbol_next(p_tmp, i) < 0)
343 break;
344 kdb_printf("%s ", p_tmp);
345 *(p_tmp + len) = '\0';
346 }
347 if (i >= dtab_count)
348 kdb_printf("...");
349 kdb_printf("\n");
350 kdb_printf(kdb_prompt_str);
351 kdb_printf("%s", buffer);
352 } else if (tab != 2 && count > 0) {
353 len_tmp = strlen(p_tmp);
354 strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
355 len_tmp = strlen(p_tmp);
356 strncpy(cp, p_tmp+len, len_tmp-len + 1);
357 len = len_tmp - len;
358 kdb_printf("%s", cp);
359 cp += len;
360 lastchar += len;
361 }
362 kdb_nextline = 1; /* reset output line number */
363 break;
364 default:
365 if (key >= 32 && lastchar < bufend) {
366 if (cp < lastchar) {
367 memcpy(tmpbuffer, cp, lastchar - cp);
368 memcpy(cp+1, tmpbuffer, lastchar - cp);
369 *++lastchar = '\0';
370 *cp = key;
371 kdb_printf("%s\r", cp);
372 ++cp;
373 tmp = *cp;
374 *cp = '\0';
375 kdb_printf(kdb_prompt_str);
376 kdb_printf("%s", buffer);
377 *cp = tmp;
378 } else {
379 *++lastchar = '\0';
380 *cp++ = key;
381 /* The kgdb transition check will hide
382 * printed characters if we think that
383 * kgdb is connecting, until the check
384 * fails */
385 if (!KDB_STATE(KGDB_TRANS))
386 kgdb_transition_check(buffer);
387 else
388 kdb_printf("%c", key);
389 }
390 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) {
393 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB);
395 return buffer;
396 }
397 if (lastchar - buffer >= 14 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) {
399 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2);
401 return buffer;
402 }
403 }
404 break;
405 }
406 goto poll_again;
407}
408
409/*
410 * kdb_getstr
411 *
412 * Print the prompt string and read a command from the
413 * input device.
414 *
415 * Parameters:
416 * buffer Address of buffer to receive command
417 * bufsize Size of buffer in bytes
418 * prompt Pointer to string to use as prompt string
419 * Returns:
420 * Pointer to command buffer.
421 * Locking:
422 * None.
423 * Remarks:
424 * For SMP kernels, the processor number will be
425 * substituted for %d, %x or %o in the prompt.
426 */
427
428char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
429{
430 if (prompt && kdb_prompt_str != prompt)
431 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
432 kdb_printf(kdb_prompt_str);
433 kdb_nextline = 1; /* Prompt and input resets line number */
434 return kdb_read(buffer, bufsize);
435}
436
437/*
438 * kdb_input_flush
439 *
440 * Get rid of any buffered console input.
441 *
442 * Parameters:
443 * none
444 * Returns:
445 * nothing
446 * Locking:
447 * none
448 * Remarks:
449 * Call this function whenever you want to flush input. If there is any
450 * outstanding input, it ignores all characters until there has been no
451 * data for approximately 1ms.
452 */
453
454static void kdb_input_flush(void)
455{
456 get_char_func *f;
457 int res;
458 int flush_delay = 1;
459 while (flush_delay) {
460 flush_delay--;
461empty:
462 touch_nmi_watchdog();
463 for (f = &kdb_poll_funcs[0]; *f; ++f) {
464 res = (*f)();
465 if (res != -1) {
466 flush_delay = 1;
467 goto empty;
468 }
469 }
470 if (flush_delay)
471 mdelay(1);
472 }
473}
474
475/*
476 * kdb_printf
477 *
478 * Print a string to the output device(s).
479 *
480 * Parameters:
481 * printf-like format and optional args.
482 * Returns:
483 * 0
484 * Locking:
485 * None.
486 * Remarks:
487 * use 'kdbcons->write()' to avoid polluting 'log_buf' with
488 * kdb output.
489 *
490 * If the user is doing a cmd args | grep srch
491 * then kdb_grepping_flag is set.
492 * In that case we need to accumulate full lines (ending in \n) before
493 * searching for the pattern.
494 */
495
496static char kdb_buffer[256]; /* A bit too big to go on stack */
497static char *next_avail = kdb_buffer;
498static int size_avail;
499static int suspend_grep;
500
501/*
502 * search arg1 to see if it contains arg2
503 * (kdb_main.c provides flags for ^pat and pat$)
504 *
505 * return 1 for found, 0 for not found
506 */
507static int kdb_search_string(char *searched, char *searchfor)
508{
509 char firstchar, *cp;
510 int len1, len2;
511
512 /* not counting the newline at the end of "searched" */
513 len1 = strlen(searched)-1;
514 len2 = strlen(searchfor);
515 if (len1 < len2)
516 return 0;
517 if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
518 return 0;
519 if (kdb_grep_leading) {
520 if (!strncmp(searched, searchfor, len2))
521 return 1;
522 } else if (kdb_grep_trailing) {
523 if (!strncmp(searched+len1-len2, searchfor, len2))
524 return 1;
525 } else {
526 firstchar = *searchfor;
527 cp = searched;
528 while ((cp = strchr(cp, firstchar))) {
529 if (!strncmp(cp, searchfor, len2))
530 return 1;
531 cp++;
532 }
533 }
534 return 0;
535}
536
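A stand-alone sketch of the same anchored-match rules applied by kdb_search_string() for "cmd | grep pattern", runnable in user space; it assumes the line has already had its trailing newline stripped, which is what the len1 adjustment above accounts for:

#include <stdio.h>
#include <string.h>

static int match(const char *line, const char *pat, int lead, int trail)
{
	size_t len1 = strlen(line);	/* newline already stripped here */
	size_t len2 = strlen(pat);

	if (len1 < len2)
		return 0;
	if (lead && trail)		/* ^pat$ : whole-line match */
		return len1 == len2 && !strncmp(line, pat, len2);
	if (lead)			/* ^pat  : prefix match */
		return !strncmp(line, pat, len2);
	if (trail)			/* pat$  : suffix match */
		return !strncmp(line + len1 - len2, pat, len2);
	return strstr(line, pat) != NULL;	/* plain substring */
}

int main(void)
{
	printf("%d %d %d\n",
	       match("kdb_printf+0x10", "kdb_", 1, 0),		/* 1: prefix  */
	       match("kdb_printf+0x10", "0x10", 0, 1),		/* 1: suffix  */
	       match("kdb_printf+0x10", "print", 0, 0));	/* 1: anywhere */
	return 0;
}
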
537int vkdb_printf(const char *fmt, va_list ap)
538{
539 int diag;
540 int linecount;
541 int logging, saved_loglevel = 0;
542 int saved_trap_printk;
543 int got_printf_lock = 0;
544 int retlen = 0;
545 int fnd, len;
546 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
547 char *moreprompt = "more> ";
548 struct console *c = console_drivers;
549 static DEFINE_SPINLOCK(kdb_printf_lock);
550 unsigned long uninitialized_var(flags);
551
552 preempt_disable();
553 saved_trap_printk = kdb_trap_printk;
554 kdb_trap_printk = 0;
555
556 /* Serialize kdb_printf if multiple cpus try to write at once.
557 * But if any cpu goes recursive in kdb, just print the output,
558 * even if it is interleaved with any other text.
559 */
560 if (!KDB_STATE(PRINTF_LOCK)) {
561 KDB_STATE_SET(PRINTF_LOCK);
562 spin_lock_irqsave(&kdb_printf_lock, flags);
563 got_printf_lock = 1;
564 atomic_inc(&kdb_event);
565 } else {
566 __acquire(kdb_printf_lock);
567 }
568
569 diag = kdbgetintenv("LINES", &linecount);
570 if (diag || linecount <= 1)
571 linecount = 24;
572
573 diag = kdbgetintenv("LOGGING", &logging);
574 if (diag)
575 logging = 0;
576
577 if (!kdb_grepping_flag || suspend_grep) {
578 /* normally, every vsnprintf starts a new buffer */
579 next_avail = kdb_buffer;
580 size_avail = sizeof(kdb_buffer);
581 }
582 vsnprintf(next_avail, size_avail, fmt, ap);
583
584 /*
585 * If kdb_parse() found that the command was cmd xxx | grep yyy
586 * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
587 *
588 * Accumulate the print data up to a newline before searching it.
589 * (vsnprintf does null-terminate the string that it generates)
590 */
591
592 /* skip the search if prints are temporarily unconditional */
593 if (!suspend_grep && kdb_grepping_flag) {
594 cp = strchr(kdb_buffer, '\n');
595 if (!cp) {
596 /*
597 * Special cases that do not end with a newline
598 * but should still be written without waiting
599 * for one:
600 * The "[nn]kdb> " prompt should appear at
601 * the front of the buffer.
602 *
603 * The "[nn]more " prompt (MOREPROMPT ->
604 * moreprompt) should also be written without a
605 * newline, but we print that ourselves, so we
606 * set the suspend_grep flag to make it unconditional.
607 *
608 */
609 if (next_avail == kdb_buffer) {
610 /*
611 * these should occur after a newline,
612 * so they will be at the front of the
613 * buffer
614 */
615 cp2 = kdb_buffer;
616 len = strlen(kdb_prompt_str);
617 if (!strncmp(cp2, kdb_prompt_str, len)) {
618 /*
619 * We're about to start a new
620 * command, so we can go back
621 * to normal mode.
622 */
623 kdb_grepping_flag = 0;
624 goto kdb_printit;
625 }
626 }
627 /* no newline; don't search/write the buffer
628 until one is there */
629 len = strlen(kdb_buffer);
630 next_avail = kdb_buffer + len;
631 size_avail = sizeof(kdb_buffer) - len;
632 goto kdb_print_out;
633 }
634
635 /*
636 * The newline is present; print through it or discard
637 * it, depending on the results of the search.
638 */
639 cp++; /* to byte after the newline */
640 replaced_byte = *cp; /* remember what/where it was */
641 cphold = cp;
642 *cp = '\0'; /* end the string for our search */
643
644 /*
645 * We now have a newline at the end of the string
646 * Only continue with this output if it contains the
647 * search string.
648 */
649 fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
650 if (!fnd) {
651 /*
652 * At this point the complete line at the start
653 * of kdb_buffer can be discarded, as it does
654 * not contain what the user is looking for.
655 * Shift the buffer left.
656 */
657 *cphold = replaced_byte;
658 strcpy(kdb_buffer, cphold);
659 len = strlen(kdb_buffer);
660 next_avail = kdb_buffer + len;
661 size_avail = sizeof(kdb_buffer) - len;
662 goto kdb_print_out;
663 }
664 /*
665 * at this point the string is a full line and
666 * should be printed, up to the null.
667 */
668 }
669kdb_printit:
670
671 /*
672 * Write to all consoles.
673 */
674 retlen = strlen(kdb_buffer);
675 if (!dbg_kdb_mode && kgdb_connected) {
676 gdbstub_msg_write(kdb_buffer, retlen);
677 } else {
678 if (!dbg_io_ops->is_console) {
679 len = strlen(kdb_buffer);
680 cp = kdb_buffer;
681 while (len--) {
682 dbg_io_ops->write_char(*cp);
683 cp++;
684 }
685 }
686 while (c) {
687 c->write(c, kdb_buffer, retlen);
688 touch_nmi_watchdog();
689 c = c->next;
690 }
691 }
692 if (logging) {
693 saved_loglevel = console_loglevel;
694 console_loglevel = 0;
695 printk(KERN_INFO "%s", kdb_buffer);
696 }
697
698 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
699 kdb_nextline++;
700
701 /* check for having reached the LINES number of printed lines */
702 if (kdb_nextline == linecount) {
703 char buf1[16] = "";
704#if defined(CONFIG_SMP)
705 char buf2[32];
706#endif
707
708 /* Watch out for recursion here. Any routine that calls
709 * kdb_printf will come back through here. And kdb_read
710 * uses kdb_printf to echo on serial consoles ...
711 */
712 kdb_nextline = 1; /* In case of recursion */
713
714 /*
715 * Pause until cr.
716 */
717 moreprompt = kdbgetenv("MOREPROMPT");
718 if (moreprompt == NULL)
719 moreprompt = "more> ";
720
721#if defined(CONFIG_SMP)
722 if (strchr(moreprompt, '%')) {
723 sprintf(buf2, moreprompt, get_cpu());
724 put_cpu();
725 moreprompt = buf2;
726 }
727#endif
728
729 kdb_input_flush();
730 c = console_drivers;
731
732 if (!dbg_io_ops->is_console) {
733 len = strlen(moreprompt);
734 cp = moreprompt;
735 while (len--) {
736 dbg_io_ops->write_char(*cp);
737 cp++;
738 }
739 }
740 while (c) {
741 c->write(c, moreprompt, strlen(moreprompt));
742 touch_nmi_watchdog();
743 c = c->next;
744 }
745
746 if (logging)
747 printk("%s", moreprompt);
748
749 kdb_read(buf1, 2); /* '2' indicates to return
750 * immediately after getting one key. */
751 kdb_nextline = 1; /* Really set output line 1 */
752
753 /* empty and reset the buffer: */
754 kdb_buffer[0] = '\0';
755 next_avail = kdb_buffer;
756 size_avail = sizeof(kdb_buffer);
757 if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
758 /* user hit q or Q */
759 KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
760 KDB_STATE_CLEAR(PAGER);
761 /* end of command output; back to normal mode */
762 kdb_grepping_flag = 0;
763 kdb_printf("\n");
764 } else if (buf1[0] == ' ') {
765 kdb_printf("\n");
766 suspend_grep = 1; /* for this recursion */
767 } else if (buf1[0] == '\n') {
768 kdb_nextline = linecount - 1;
769 kdb_printf("\r");
770 suspend_grep = 1; /* for this recursion */
771 } else if (buf1[0] && buf1[0] != '\n') {
772 /* user hit something other than enter */
773 suspend_grep = 1; /* for this recursion */
774 kdb_printf("\nOnly 'q' or 'Q' are processed at more "
775 "prompt, input ignored\n");
776 } else if (kdb_grepping_flag) {
777 /* user hit enter */
778 suspend_grep = 1; /* for this recursion */
779 kdb_printf("\n");
780 }
781 kdb_input_flush();
782 }
783
784 /*
785 * For grep searches, shift the printed string left.
786 * replaced_byte contains the character that was overwritten with
787 * the terminating null, and cphold points to the null.
788 * Then adjust the notion of available space in the buffer.
789 */
790 if (kdb_grepping_flag && !suspend_grep) {
791 *cphold = replaced_byte;
792 strcpy(kdb_buffer, cphold);
793 len = strlen(kdb_buffer);
794 next_avail = kdb_buffer + len;
795 size_avail = sizeof(kdb_buffer) - len;
796 }
797
798kdb_print_out:
799 suspend_grep = 0; /* end of what may have been a recursive call */
800 if (logging)
801 console_loglevel = saved_loglevel;
802 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
803 got_printf_lock = 0;
804 spin_unlock_irqrestore(&kdb_printf_lock, flags);
805 KDB_STATE_CLEAR(PRINTF_LOCK);
806 atomic_dec(&kdb_event);
807 } else {
808 __release(kdb_printf_lock);
809 }
810 kdb_trap_printk = saved_trap_printk;
811 preempt_enable();
812 return retlen;
813}
814
815int kdb_printf(const char *fmt, ...)
816{
817 va_list ap;
818 int r;
819
820 va_start(ap, fmt);
821 r = vkdb_printf(fmt, ap);
822 va_end(ap);
823
824 return r;
825}
826
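/*
 * A minimal illustrative sketch, not part of the original kdb_io.c: it shows
 * how a kdb command handler typically emits output through kdb_printf().
 * Each newline advances the pager state above (kdb_nextline), so a long
 * listing stops automatically at the "more>" prompt, and 'q' there sets
 * CMD_INTERRUPT.  The handler name and loop bound are hypothetical.
 */
static int example_list_cmd(int argc, const char **argv)
{
	int i;

	for (i = 0; i < 64; i++) {
		/* one line per entry; the pager counts the '\n' */
		kdb_printf("entry %2d\n", i);
		if (KDB_FLAG(CMD_INTERRUPT))	/* user hit 'q' at more> */
			return 0;
	}
	return 0;
}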
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
1/*
2 * Kernel Debugger Architecture Dependent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.
6 *
7 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
8 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
9 */
10
11#include <linux/kdb.h>
12#include <linux/keyboard.h>
13#include <linux/ctype.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17/* Keyboard Controller Registers on normal PCs. */
18
19#define KBD_STATUS_REG 0x64 /* Status register (R) */
20#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
21
22/* Status Register Bits */
23
24#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26
27static int kbd_exists;
28
29/*
30 * Check if the keyboard controller has a keypress for us.
 31 * Some parts (Enter release, LED change) are still polled here (blocking),
32 * but hopefully they are all short.
33 */
34int kdb_get_kbd_char(void)
35{
36 int scancode, scanstatus;
37 static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
38 static int shift_key; /* Shift next keypress */
39 static int ctrl_key;
40 u_short keychar;
41
42 if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
43 (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
44 kbd_exists = 0;
45 return -1;
46 }
47 kbd_exists = 1;
48
49 if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
50 return -1;
51
52 /*
53 * Fetch the scancode
54 */
55 scancode = inb(KBD_DATA_REG);
56 scanstatus = inb(KBD_STATUS_REG);
57
58 /*
59 * Ignore mouse events.
60 */
61 if (scanstatus & KBD_STAT_MOUSE_OBF)
62 return -1;
63
64 /*
65 * Ignore release, trigger on make
66 * (except for shift keys, where we want to
67 * keep the shift state so long as the key is
68 * held down).
69 */
70
71 if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
72 /*
73 * Next key may use shift table
74 */
75 if ((scancode & 0x80) == 0)
76 shift_key = 1;
77 else
78 shift_key = 0;
79 return -1;
80 }
81
82 if ((scancode&0x7f) == 0x1d) {
83 /*
84 * Left ctrl key
85 */
86 if ((scancode & 0x80) == 0)
87 ctrl_key = 1;
88 else
89 ctrl_key = 0;
90 return -1;
91 }
92
93 if ((scancode & 0x80) != 0)
94 return -1;
95
96 scancode &= 0x7f;
97
98 /*
99 * Translate scancode
100 */
101
102 if (scancode == 0x3a) {
103 /*
104 * Toggle caps lock
105 */
106 shift_lock ^= 1;
107
108#ifdef KDB_BLINK_LED
109 kdb_toggleled(0x4);
110#endif
111 return -1;
112 }
113
114 if (scancode == 0x0e) {
115 /*
116 * Backspace
117 */
118 return 8;
119 }
120
121 /* Special Key */
122 switch (scancode) {
123 case 0xF: /* Tab */
124 return 9;
125 case 0x53: /* Del */
126 return 4;
127 case 0x47: /* Home */
128 return 1;
129 case 0x4F: /* End */
130 return 5;
131 case 0x4B: /* Left */
132 return 2;
133 case 0x48: /* Up */
134 return 16;
135 case 0x50: /* Down */
136 return 14;
137 case 0x4D: /* Right */
138 return 6;
139 }
140
141 if (scancode == 0xe0)
142 return -1;
143
144 /*
145 * For Japanese 86/106 keyboards
146 * See comment in drivers/char/pc_keyb.c.
147 * - Masahiro Adegawa
148 */
149 if (scancode == 0x73)
150 scancode = 0x59;
151 else if (scancode == 0x7d)
152 scancode = 0x7c;
153
154 if (!shift_lock && !shift_key && !ctrl_key) {
155 keychar = plain_map[scancode];
156 } else if ((shift_lock || shift_key) && key_maps[1]) {
157 keychar = key_maps[1][scancode];
158 } else if (ctrl_key && key_maps[4]) {
159 keychar = key_maps[4][scancode];
160 } else {
161 keychar = 0x0020;
162 kdb_printf("Unknown state/scancode (%d)\n", scancode);
163 }
164 keychar &= 0x0fff;
165 if (keychar == '\t')
166 keychar = ' ';
167 switch (KTYP(keychar)) {
168 case KT_LETTER:
169 case KT_LATIN:
170 if (isprint(keychar))
171 break; /* printable characters */
172 /* drop through */
173 case KT_SPEC:
174 if (keychar == K_ENTER)
175 break;
176 /* drop through */
177 default:
178 return -1; /* ignore unprintables */
179 }
180
181 if ((scancode & 0x7f) == 0x1c) {
182 /*
183 * enter key. All done. Absorb the release scancode.
184 */
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ;
187
188 /*
189 * Fetch the scancode
190 */
191 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG);
193
194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
195 scancode = inb(KBD_DATA_REG);
196 scanstatus = inb(KBD_STATUS_REG);
197 }
198
199 if (scancode != 0x9c) {
200 /*
201 * Wasn't an enter-release, why not?
202 */
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
204 scancode, scanstatus);
205 }
206
207 return 13;
208 }
209
210 return keychar & 0xff;
211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
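/*
 * A minimal illustrative sketch, not part of the original kdb_keyboard.c:
 * a caller polls kdb_get_kbd_char() until it returns something other than
 * -1; the value is a translated key code (e.g. 13 for Enter, 8 for
 * Backspace, as returned above).  The helper name is hypothetical.
 */
static int example_wait_for_key(void)
{
	int key;

	do {
		key = kdb_get_kbd_char();	/* -1 means no key pending */
	} while (key == -1);

	return key;
}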
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..ebe4a287419e
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2846 @@
1/*
2 * Kernel Debugger Architecture Independent Main Code
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
10 * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
11 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
12 */
13
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/reboot.h>
18#include <linux/sched.h>
19#include <linux/sysrq.h>
20#include <linux/smp.h>
21#include <linux/utsname.h>
22#include <linux/vmalloc.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/kallsyms.h>
27#include <linux/kgdb.h>
28#include <linux/kdb.h>
29#include <linux/notifier.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/nmi.h>
33#include <linux/time.h>
34#include <linux/ptrace.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/kdebug.h>
38#include <linux/proc_fs.h>
39#include <linux/uaccess.h>
40#include <linux/slab.h>
41#include "kdb_private.h"
42
43#define GREP_LEN 256
44char kdb_grep_string[GREP_LEN];
45int kdb_grepping_flag;
46EXPORT_SYMBOL(kdb_grepping_flag);
47int kdb_grep_leading;
48int kdb_grep_trailing;
49
50/*
51 * Kernel debugger state flags
52 */
53int kdb_flags;
54atomic_t kdb_event;
55
56/*
57 * kdb_lock protects updates to kdb_initial_cpu. Used to
58 * single thread processors through the kernel debugger.
59 */
60int kdb_initial_cpu = -1; /* cpu number that owns kdb */
61int kdb_nextline = 1;
62int kdb_state; /* General KDB state */
63
64struct task_struct *kdb_current_task;
65EXPORT_SYMBOL(kdb_current_task);
66struct pt_regs *kdb_current_regs;
67
68const char *kdb_diemsg;
69static int kdb_go_count;
70#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
71static unsigned int kdb_continue_catastrophic =
72 CONFIG_KDB_CONTINUE_CATASTROPHIC;
73#else
74static unsigned int kdb_continue_catastrophic;
75#endif
76
77/* kdb_commands describes the available commands. */
78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
86
87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */
89 char *km_msg; /* Corresponding message text */
90} kdbmsg_t;
91
92#define KDBMSG(msgnum, text) \
93 { KDB_##msgnum, text }
94
95static kdbmsg_t kdbmsgs[] = {
96 KDBMSG(NOTFOUND, "Command Not Found"),
97 KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
98 KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
99 "8 is only allowed on 64 bit systems"),
100 KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
101 KDBMSG(NOTENV, "Cannot find environment variable"),
102 KDBMSG(NOENVVALUE, "Environment variable should have value"),
103 KDBMSG(NOTIMP, "Command not implemented"),
104 KDBMSG(ENVFULL, "Environment full"),
105 KDBMSG(ENVBUFFULL, "Environment buffer full"),
106 KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
107#ifdef CONFIG_CPU_XSCALE
108 KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
109#else
110 KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
111#endif
112 KDBMSG(DUPBPT, "Duplicate breakpoint address"),
113 KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
114 KDBMSG(BADMODE, "Invalid IDMODE"),
115 KDBMSG(BADINT, "Illegal numeric value"),
116 KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
117 KDBMSG(BADREG, "Invalid register name"),
118 KDBMSG(BADCPUNUM, "Invalid cpu number"),
119 KDBMSG(BADLENGTH, "Invalid length field"),
120 KDBMSG(NOBP, "No Breakpoint exists"),
121 KDBMSG(BADADDR, "Invalid address"),
122};
123#undef KDBMSG
124
125static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
126
127
128/*
129 * Initial environment. This is all kept static and local to
130 * this file. We don't want to rely on the memory allocation
131 * mechanisms in the kernel, so we use a very limited allocate-only
132 * heap for new and altered environment variables. The entire
133 * environment is limited to a fixed number of entries (add more
134 * to __env[] if required) and a fixed amount of heap (add more to
135 * KDB_ENVBUFSIZE if required).
136 */
137
138static char *__env[] = {
139#if defined(CONFIG_SMP)
140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
142#else
143 "PROMPT=kdb> ",
144 "MOREPROMPT=more> ",
145#endif
146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30",
151 "NOSECT=1",
152 (char *)0,
153 (char *)0,
154 (char *)0,
155 (char *)0,
156 (char *)0,
157 (char *)0,
158 (char *)0,
159 (char *)0,
160 (char *)0,
161 (char *)0,
162 (char *)0,
163 (char *)0,
164 (char *)0,
165 (char *)0,
166 (char *)0,
167 (char *)0,
168 (char *)0,
169 (char *)0,
170 (char *)0,
171 (char *)0,
172 (char *)0,
173 (char *)0,
174 (char *)0,
175};
176
177static const int __nenv = (sizeof(__env) / sizeof(char *));
178
179struct task_struct *kdb_curr_task(int cpu)
180{
181 struct task_struct *p = curr_task(cpu);
182#ifdef _TIF_MCA_INIT
183 if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
184 p = krp->p;
185#endif
186 return p;
187}
188
189/*
190 * kdbgetenv - This function will return the character string value of
191 * an environment variable.
192 * Parameters:
193 * match A character string representing an environment variable.
194 * Returns:
195 * NULL No environment variable matches 'match'
196 * char* Pointer to string value of environment variable.
197 */
198char *kdbgetenv(const char *match)
199{
200 char **ep = __env;
201 int matchlen = strlen(match);
202 int i;
203
204 for (i = 0; i < __nenv; i++) {
205 char *e = *ep++;
206
207 if (!e)
208 continue;
209
210 if ((strncmp(match, e, matchlen) == 0)
211 && ((e[matchlen] == '\0')
212 || (e[matchlen] == '='))) {
213 char *cp = strchr(e, '=');
214 return cp ? ++cp : "";
215 }
216 }
217 return NULL;
218}
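/*
 * A minimal illustrative sketch, not part of the original file:
 * kdbgetenv() returns NULL when the variable is absent, "" when it is set
 * without a value, and the text after '=' otherwise, so callers usually
 * supply their own default.  The helper name is hypothetical.
 */
static const char *example_get_prompt(void)
{
	char *p = kdbgetenv("PROMPT");

	return (p && *p) ? p : "kdb> ";	/* fall back to a default prompt */
}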
219
220/*
221 * kdballocenv - This function is used to allocate bytes for
222 * environment entries.
223 * Parameters:
 224 * bytes The number of bytes of environment buffer space to allocate
 225 * Outputs:
 226 * None
 227 * Returns:
 228 * A pointer to the allocated space, or NULL if the buffer is exhausted.
229 * Remarks:
230 * We use a static environment buffer (envbuffer) to hold the values
231 * of dynamically generated environment variables (see kdb_set). Buffer
232 * space once allocated is never free'd, so over time, the amount of space
233 * (currently 512 bytes) will be exhausted if env variables are changed
234 * frequently.
235 */
236static char *kdballocenv(size_t bytes)
237{
238#define KDB_ENVBUFSIZE 512
239 static char envbuffer[KDB_ENVBUFSIZE];
240 static int envbufsize;
241 char *ep = NULL;
242
243 if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
244 ep = &envbuffer[envbufsize];
245 envbufsize += bytes;
246 }
247 return ep;
248}
249
250/*
251 * kdbgetulenv - This function will return the value of an unsigned
252 * long-valued environment variable.
253 * Parameters:
254 * match A character string representing a numeric value
255 * Outputs:
 256 * *value the unsigned long representation of the env variable 'match'
257 * Returns:
258 * Zero on success, a kdb diagnostic on failure.
259 */
260static int kdbgetulenv(const char *match, unsigned long *value)
261{
262 char *ep;
263
264 ep = kdbgetenv(match);
265 if (!ep)
266 return KDB_NOTENV;
267 if (strlen(ep) == 0)
268 return KDB_NOENVVALUE;
269
270 *value = simple_strtoul(ep, NULL, 0);
271
272 return 0;
273}
274
275/*
276 * kdbgetintenv - This function will return the value of an
277 * integer-valued environment variable.
278 * Parameters:
279 * match A character string representing an integer-valued env variable
280 * Outputs:
281 * *value the integer representation of the environment variable 'match'
282 * Returns:
283 * Zero on success, a kdb diagnostic on failure.
284 */
285int kdbgetintenv(const char *match, int *value)
286{
287 unsigned long val;
288 int diag;
289
290 diag = kdbgetulenv(match, &val);
291 if (!diag)
292 *value = (int) val;
293 return diag;
294}
295
296/*
297 * kdbgetularg - This function will convert a numeric string into an
298 * unsigned long value.
299 * Parameters:
300 * arg A character string representing a numeric value
301 * Outputs:
 302 * *value the unsigned long representation of arg.
303 * Returns:
304 * Zero on success, a kdb diagnostic on failure.
305 */
306int kdbgetularg(const char *arg, unsigned long *value)
307{
308 char *endp;
309 unsigned long val;
310
311 val = simple_strtoul(arg, &endp, 0);
312
313 if (endp == arg) {
314 /*
315 * Try base 16, for us folks too lazy to type the
316 * leading 0x...
317 */
318 val = simple_strtoul(arg, &endp, 16);
319 if (endp == arg)
320 return KDB_BADINT;
321 }
322
323 *value = val;
324
325 return 0;
326}
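/*
 * A minimal illustrative sketch, not part of the original file:
 * kdbgetularg() accepts the usual 0x/0 prefixes and, as a convenience,
 * retries bare strings as base 16, so "c000" parses the same as "0xc000".
 * A non-zero return is a kdb diagnostic (KDB_BADINT).  The helper name
 * is hypothetical.
 */
static int example_parse_addr(const char *arg)
{
	unsigned long addr;
	int diag;

	diag = kdbgetularg(arg, &addr);		/* "c000" == "0xc000" */
	if (diag)
		return diag;			/* KDB_BADINT */
	kdb_printf("parsed " kdb_machreg_fmt "\n", addr);
	return 0;
}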
327
328/*
329 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one.
331 */
332int kdb_set(int argc, const char **argv)
333{
334 int i;
335 char *ep;
336 size_t varlen, vallen;
337
338 /*
339 * we can be invoked two ways:
340 * set var=value argv[1]="var", argv[2]="value"
341 * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
342 * - if the latter, shift 'em down.
343 */
344 if (argc == 3) {
345 argv[2] = argv[3];
346 argc--;
347 }
348
349 if (argc != 2)
350 return KDB_ARGCOUNT;
351
352 /*
353 * Check for internal variables
354 */
355 if (strcmp(argv[1], "KDBDEBUG") == 0) {
356 unsigned int debugflags;
357 char *cp;
358
359 debugflags = simple_strtoul(argv[2], &cp, 0);
360 if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
361 kdb_printf("kdb: illegal debug flags '%s'\n",
362 argv[2]);
363 return 0;
364 }
365 kdb_flags = (kdb_flags &
366 ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
367 | (debugflags << KDB_DEBUG_FLAG_SHIFT);
368
369 return 0;
370 }
371
372 /*
373 * Tokenizer squashed the '=' sign. argv[1] is variable
374 * name, argv[2] = value.
375 */
376 varlen = strlen(argv[1]);
377 vallen = strlen(argv[2]);
378 ep = kdballocenv(varlen + vallen + 2);
379 if (ep == (char *)0)
380 return KDB_ENVBUFFULL;
381
382 sprintf(ep, "%s=%s", argv[1], argv[2]);
383
384 ep[varlen+vallen+1] = '\0';
385
386 for (i = 0; i < __nenv; i++) {
387 if (__env[i]
388 && ((strncmp(__env[i], argv[1], varlen) == 0)
389 && ((__env[i][varlen] == '\0')
390 || (__env[i][varlen] == '=')))) {
391 __env[i] = ep;
392 return 0;
393 }
394 }
395
396 /*
397 * Wasn't existing variable. Fit into slot.
398 */
399 for (i = 0; i < __nenv-1; i++) {
400 if (__env[i] == (char *)0) {
401 __env[i] = ep;
402 return 0;
403 }
404 }
405
406 return KDB_ENVFULL;
407}
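/*
 * A minimal illustrative sketch, not part of the original file: other kdb
 * code invokes kdb_set() directly by building an argv just as the parser
 * would, the same way kdb_local() below forces LOGGING on before an
 * automatic dump.  The wrapper name is hypothetical.
 */
static int example_enable_logging(void)
{
	const char *setargs[] = { "set", "LOGGING", "1" };

	/* argc counts the arguments after argv[0], as kdb_parse() does */
	return kdb_set(2, setargs);
}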
408
409static int kdb_check_regs(void)
410{
411 if (!kdb_current_regs) {
412 kdb_printf("No current kdb registers."
413 " You may need to select another task\n");
414 return KDB_BADREG;
415 }
416 return 0;
417}
418
419/*
420 * kdbgetaddrarg - This function is responsible for parsing an
421 * address-expression and returning the value of the expression,
422 * symbol name, and offset to the caller.
423 *
424 * The argument may consist of a numeric value (decimal or
 425 * hexadecimal), a symbol name, a register name (preceded by the
426 * percent sign), an environment variable with a numeric value
 427 * (preceded by a dollar sign) or a simple arithmetic expression
428 * consisting of a symbol name, +/-, and a numeric constant value
429 * (offset).
430 * Parameters:
431 * argc - count of arguments in argv
432 * argv - argument vector
433 * *nextarg - index to next unparsed argument in argv[]
434 * regs - Register state at time of KDB entry
435 * Outputs:
436 * *value - receives the value of the address-expression
437 * *offset - receives the offset specified, if any
438 * *name - receives the symbol name, if any
439 * *nextarg - index to next unparsed argument in argv[]
440 * Returns:
441 * zero is returned on success, a kdb diagnostic code is
442 * returned on error.
443 */
444int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
445 unsigned long *value, long *offset,
446 char **name)
447{
448 unsigned long addr;
449 unsigned long off = 0;
450 int positive;
451 int diag;
452 int found = 0;
453 char *symname;
454 char symbol = '\0';
455 char *cp;
456 kdb_symtab_t symtab;
457
458 /*
459 * Process arguments which follow the following syntax:
460 *
461 * symbol | numeric-address [+/- numeric-offset]
462 * %register
463 * $environment-variable
464 */
465
466 if (*nextarg > argc)
467 return KDB_ARGCOUNT;
468
469 symname = (char *)argv[*nextarg];
470
471 /*
472 * If there is no whitespace between the symbol
473 * or address and the '+' or '-' symbols, we
474 * remember the character and replace it with a
475 * null so the symbol/value can be properly parsed
476 */
477 cp = strpbrk(symname, "+-");
478 if (cp != NULL) {
479 symbol = *cp;
480 *cp++ = '\0';
481 }
482
483 if (symname[0] == '$') {
484 diag = kdbgetulenv(&symname[1], &addr);
485 if (diag)
486 return diag;
487 } else if (symname[0] == '%') {
488 diag = kdb_check_regs();
489 if (diag)
490 return diag;
491 /* Implement register values with % at a later time as it is
492 * arch optional.
493 */
494 return KDB_NOTIMP;
495 } else {
496 found = kdbgetsymval(symname, &symtab);
497 if (found) {
498 addr = symtab.sym_start;
499 } else {
500 diag = kdbgetularg(argv[*nextarg], &addr);
501 if (diag)
502 return diag;
503 }
504 }
505
506 if (!found)
507 found = kdbnearsym(addr, &symtab);
508
509 (*nextarg)++;
510
511 if (name)
512 *name = symname;
513 if (value)
514 *value = addr;
515 if (offset && name && *name)
516 *offset = addr - symtab.sym_start;
517
518 if ((*nextarg > argc)
519 && (symbol == '\0'))
520 return 0;
521
522 /*
523 * check for +/- and offset
524 */
525
526 if (symbol == '\0') {
527 if ((argv[*nextarg][0] != '+')
528 && (argv[*nextarg][0] != '-')) {
529 /*
530 * Not our argument. Return.
531 */
532 return 0;
533 } else {
534 positive = (argv[*nextarg][0] == '+');
535 (*nextarg)++;
536 }
537 } else
538 positive = (symbol == '+');
539
540 /*
541 * Now there must be an offset!
542 */
543 if ((*nextarg > argc)
544 && (symbol == '\0')) {
545 return KDB_INVADDRFMT;
546 }
547
548 if (!symbol) {
549 cp = (char *)argv[*nextarg];
550 (*nextarg)++;
551 }
552
553 diag = kdbgetularg(cp, &off);
554 if (diag)
555 return diag;
556
557 if (!positive)
558 off = -off;
559
560 if (offset)
561 *offset += off;
562
563 if (value)
564 *value += off;
565
566 return 0;
567}
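/*
 * A minimal illustrative sketch, not part of the original file: a command
 * handler hands its argv to kdbgetaddrarg() and gets back the resolved
 * value plus the symbol/offset decomposition, exactly as kdb_md() and
 * kdb_mm() below do.  The handler name is hypothetical.
 */
static int example_resolve(int argc, const char **argv)
{
	unsigned long addr;
	long offset = 0;
	char *name = NULL;
	int nextarg = 1;
	int diag;

	if (argc < 1)
		return KDB_ARGCOUNT;
	/* accepts forms such as "schedule", "schedule+0x10" or "0xc0100000" */
	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, &name);
	if (diag)
		return diag;
	kdb_printf("%s resolves to " kdb_machreg_fmt "\n", argv[1], addr);
	return 0;
}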
568
569static void kdb_cmderror(int diag)
570{
571 int i;
572
573 if (diag >= 0) {
574 kdb_printf("no error detected (diagnostic is %d)\n", diag);
575 return;
576 }
577
578 for (i = 0; i < __nkdb_err; i++) {
579 if (kdbmsgs[i].km_diag == diag) {
580 kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
581 return;
582 }
583 }
584
585 kdb_printf("Unknown diag %d\n", -diag);
586}
587
588/*
589 * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
590 * command which defines one command as a set of other commands,
591 * terminated by endefcmd. kdb_defcmd processes the initial
592 * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
593 * the following commands until 'endefcmd'.
594 * Inputs:
595 * argc argument count
596 * argv argument vector
597 * Returns:
598 * zero for success, a kdb diagnostic if error
599 */
600struct defcmd_set {
601 int count;
602 int usable;
603 char *name;
604 char *usage;
605 char *help;
606 char **command;
607};
608static struct defcmd_set *defcmd_set;
609static int defcmd_set_count;
610static int defcmd_in_progress;
611
612/* Forward references */
613static int kdb_exec_defcmd(int argc, const char **argv);
614
615static int kdb_defcmd2(const char *cmdstr, const char *argv0)
616{
617 struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
618 char **save_command = s->command;
619 if (strcmp(argv0, "endefcmd") == 0) {
620 defcmd_in_progress = 0;
621 if (!s->count)
622 s->usable = 0;
623 if (s->usable)
624 kdb_register(s->name, kdb_exec_defcmd,
625 s->usage, s->help, 0);
626 return 0;
627 }
628 if (!s->usable)
629 return KDB_NOTIMP;
630 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
631 if (!s->command) {
632 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
633 cmdstr);
634 s->usable = 0;
635 return KDB_NOTIMP;
636 }
637 memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
638 s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
639 kfree(save_command);
640 return 0;
641}
642
643static int kdb_defcmd(int argc, const char **argv)
644{
645 struct defcmd_set *save_defcmd_set = defcmd_set, *s;
646 if (defcmd_in_progress) {
647 kdb_printf("kdb: nested defcmd detected, assuming missing "
648 "endefcmd\n");
649 kdb_defcmd2("endefcmd", "endefcmd");
650 }
651 if (argc == 0) {
652 int i;
653 for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
654 kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
655 s->usage, s->help);
656 for (i = 0; i < s->count; ++i)
657 kdb_printf("%s", s->command[i]);
658 kdb_printf("endefcmd\n");
659 }
660 return 0;
661 }
662 if (argc != 3)
663 return KDB_ARGCOUNT;
664 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
665 GFP_KDB);
666 if (!defcmd_set) {
667 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
668 argv[1]);
669 defcmd_set = save_defcmd_set;
670 return KDB_NOTIMP;
671 }
672 memcpy(defcmd_set, save_defcmd_set,
673 defcmd_set_count * sizeof(*defcmd_set));
674 kfree(save_defcmd_set);
675 s = defcmd_set + defcmd_set_count;
676 memset(s, 0, sizeof(*s));
677 s->usable = 1;
678 s->name = kdb_strdup(argv[1], GFP_KDB);
679 s->usage = kdb_strdup(argv[2], GFP_KDB);
680 s->help = kdb_strdup(argv[3], GFP_KDB);
681 if (s->usage[0] == '"') {
682 strcpy(s->usage, s->usage+1);
683 s->usage[strlen(s->usage)-1] = '\0';
684 }
685 if (s->help[0] == '"') {
686 strcpy(s->help, s->help+1);
687 s->help[strlen(s->help)-1] = '\0';
688 }
689 ++defcmd_set_count;
690 defcmd_in_progress = 1;
691 return 0;
692}
693
694/*
695 * kdb_exec_defcmd - Execute the set of commands associated with this
696 * defcmd name.
697 * Inputs:
698 * argc argument count
699 * argv argument vector
700 * Returns:
701 * zero for success, a kdb diagnostic if error
702 */
703static int kdb_exec_defcmd(int argc, const char **argv)
704{
705 int i, ret;
706 struct defcmd_set *s;
707 if (argc != 0)
708 return KDB_ARGCOUNT;
709 for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
710 if (strcmp(s->name, argv[0]) == 0)
711 break;
712 }
713 if (i == defcmd_set_count) {
714 kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
715 argv[0]);
716 return KDB_NOTIMP;
717 }
718 for (i = 0; i < s->count; ++i) {
719 /* Recursive use of kdb_parse, do not use argv after
720 * this point */
721 argv = NULL;
722 kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
723 ret = kdb_parse(s->command[i]);
724 if (ret)
725 return ret;
726 }
727 return 0;
728}
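/*
 * A minimal illustrative sketch, not part of the original file: feeding a
 * defcmd definition to kdb_parse() one line at a time shows the flow above.
 * The first line enters kdb_defcmd(), the body lines are captured by
 * kdb_defcmd2() while defcmd_in_progress is set, and "endefcmd" registers
 * the new command.  The macro name "dumpall" is made up for the example;
 * "lsmod" and "rd" are the commands defined later in this file.
 */
static void example_define_macro(void)
{
	kdb_parse("defcmd dumpall \"\" \"dump common state\"");
	kdb_parse("lsmod");		/* stored, not executed, by kdb_defcmd2() */
	kdb_parse("rd");
	kdb_parse("endefcmd");		/* registers "dumpall" via kdb_register() */
}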
729
730/* Command history */
731#define KDB_CMD_HISTORY_COUNT 32
732#define CMD_BUFLEN 200 /* kdb_printf: max printline
733 * size == 256 */
734static unsigned int cmd_head, cmd_tail;
735static unsigned int cmdptr;
736static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
737static char cmd_cur[CMD_BUFLEN];
738
739/*
740 * The "str" argument may point to something like | grep xyz
741 */
742static void parse_grep(const char *str)
743{
744 int len;
745 char *cp = (char *)str, *cp2;
746
 747 /* sanity check: we should have been called with the '|' first */
748 if (*cp != '|')
749 return;
750 cp++;
751 while (isspace(*cp))
752 cp++;
753 if (strncmp(cp, "grep ", 5)) {
754 kdb_printf("invalid 'pipe', see grephelp\n");
755 return;
756 }
757 cp += 5;
758 while (isspace(*cp))
759 cp++;
760 cp2 = strchr(cp, '\n');
761 if (cp2)
762 *cp2 = '\0'; /* remove the trailing newline */
763 len = strlen(cp);
764 if (len == 0) {
765 kdb_printf("invalid 'pipe', see grephelp\n");
766 return;
767 }
768 /* now cp points to a nonzero length search string */
769 if (*cp == '"') {
 770 /* allow it to be "x y z" by removing the "'s - there must
771 be two of them */
772 cp++;
773 cp2 = strchr(cp, '"');
774 if (!cp2) {
775 kdb_printf("invalid quoted string, see grephelp\n");
776 return;
777 }
778 *cp2 = '\0'; /* end the string where the 2nd " was */
779 }
780 kdb_grep_leading = 0;
781 if (*cp == '^') {
782 kdb_grep_leading = 1;
783 cp++;
784 }
785 len = strlen(cp);
786 kdb_grep_trailing = 0;
787 if (*(cp+len-1) == '$') {
788 kdb_grep_trailing = 1;
789 *(cp+len-1) = '\0';
790 }
791 len = strlen(cp);
792 if (!len)
793 return;
794 if (len >= GREP_LEN) {
795 kdb_printf("search string too long\n");
796 return;
797 }
798 strcpy(kdb_grep_string, cp);
799 kdb_grepping_flag++;
800 return;
801}
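/*
 * A minimal illustrative sketch, not part of the original file: after
 * parse_grep() runs on the tail of a command such as 'lsmod | grep "^usb"',
 * the globals declared near the top of this file hold the decomposed
 * pattern ("usb", leading anchor set, no trailing anchor), and the
 * non-zero kdb_grepping_flag makes vkdb_printf() filter each completed
 * output line.  The helper name is hypothetical.
 */
static void example_show_grep_state(void)
{
	char pipe[] = "| grep \"^usb\"";	/* writable: parse_grep() edits in place */

	parse_grep(pipe);
	kdb_printf("pattern '%s' leading=%d trailing=%d grepping=%d\n",
		   kdb_grep_string, kdb_grep_leading, kdb_grep_trailing,
		   kdb_grepping_flag);
}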
802
803/*
804 * kdb_parse - Parse the command line, search the command table for a
805 * matching command and invoke the command function. This
806 * function may be called recursively, if it is, the second call
807 * will overwrite argv and cbuf. It is the caller's
808 * responsibility to save their argv if they recursively call
809 * kdb_parse().
810 * Parameters:
811 * cmdstr The input command line to be parsed.
812 * regs The registers at the time kdb was entered.
813 * Returns:
814 * Zero for success, a kdb diagnostic if failure.
815 * Remarks:
816 * Limited to 20 tokens.
817 *
818 * Real rudimentary tokenization. Basically only whitespace
 819 * is considered a token delimiter (but special consideration
820 * is taken of the '=' sign as used by the 'set' command).
821 *
822 * The algorithm used to tokenize the input string relies on
823 * there being at least one whitespace (or otherwise useless)
824 * character between tokens as the character immediately following
825 * the token is altered in-place to a null-byte to terminate the
826 * token string.
827 */
828
829#define MAXARGC 20
830
831int kdb_parse(const char *cmdstr)
832{
833 static char *argv[MAXARGC];
834 static int argc;
835 static char cbuf[CMD_BUFLEN+2];
836 char *cp;
837 char *cpp, quoted;
838 kdbtab_t *tp;
839 int i, escaped, ignore_errors = 0, check_grep;
840
841 /*
842 * First tokenize the command string.
843 */
844 cp = (char *)cmdstr;
845 kdb_grepping_flag = check_grep = 0;
846
847 if (KDB_FLAG(CMD_INTERRUPT)) {
848 /* Previous command was interrupted, newline must not
849 * repeat the command */
850 KDB_FLAG_CLEAR(CMD_INTERRUPT);
851 KDB_STATE_SET(PAGER);
852 argc = 0; /* no repeat */
853 }
854
855 if (*cp != '\n' && *cp != '\0') {
856 argc = 0;
857 cpp = cbuf;
858 while (*cp) {
859 /* skip whitespace */
860 while (isspace(*cp))
861 cp++;
862 if ((*cp == '\0') || (*cp == '\n') ||
863 (*cp == '#' && !defcmd_in_progress))
864 break;
865 /* special case: check for | grep pattern */
866 if (*cp == '|') {
867 check_grep++;
868 break;
869 }
870 if (cpp >= cbuf + CMD_BUFLEN) {
871 kdb_printf("kdb_parse: command buffer "
872 "overflow, command ignored\n%s\n",
873 cmdstr);
874 return KDB_NOTFOUND;
875 }
876 if (argc >= MAXARGC - 1) {
877 kdb_printf("kdb_parse: too many arguments, "
878 "command ignored\n%s\n", cmdstr);
879 return KDB_NOTFOUND;
880 }
881 argv[argc++] = cpp;
882 escaped = 0;
883 quoted = '\0';
884 /* Copy to next unquoted and unescaped
885 * whitespace or '=' */
886 while (*cp && *cp != '\n' &&
887 (escaped || quoted || !isspace(*cp))) {
888 if (cpp >= cbuf + CMD_BUFLEN)
889 break;
890 if (escaped) {
891 escaped = 0;
892 *cpp++ = *cp++;
893 continue;
894 }
895 if (*cp == '\\') {
896 escaped = 1;
897 ++cp;
898 continue;
899 }
900 if (*cp == quoted)
901 quoted = '\0';
902 else if (*cp == '\'' || *cp == '"')
903 quoted = *cp;
904 *cpp = *cp++;
905 if (*cpp == '=' && !quoted)
906 break;
907 ++cpp;
908 }
909 *cpp++ = '\0'; /* Squash a ws or '=' character */
910 }
911 }
912 if (!argc)
913 return 0;
914 if (check_grep)
915 parse_grep(cp);
916 if (defcmd_in_progress) {
917 int result = kdb_defcmd2(cmdstr, argv[0]);
918 if (!defcmd_in_progress) {
919 argc = 0; /* avoid repeat on endefcmd */
920 *(argv[0]) = '\0';
921 }
922 return result;
923 }
924 if (argv[0][0] == '-' && argv[0][1] &&
925 (argv[0][1] < '0' || argv[0][1] > '9')) {
926 ignore_errors = 1;
927 ++argv[0];
928 }
929
930 for_each_kdbcmd(tp, i) {
931 if (tp->cmd_name) {
932 /*
933 * If this command is allowed to be abbreviated,
934 * check to see if this is it.
935 */
936
937 if (tp->cmd_minlen
938 && (strlen(argv[0]) <= tp->cmd_minlen)) {
939 if (strncmp(argv[0],
940 tp->cmd_name,
941 tp->cmd_minlen) == 0) {
942 break;
943 }
944 }
945
946 if (strcmp(argv[0], tp->cmd_name) == 0)
947 break;
948 }
949 }
950
951 /*
952 * If we don't find a command by this name, see if the first
953 * few characters of this match any of the known commands.
954 * e.g., md1c20 should match md.
955 */
956 if (i == kdb_max_commands) {
957 for_each_kdbcmd(tp, i) {
958 if (tp->cmd_name) {
959 if (strncmp(argv[0],
960 tp->cmd_name,
961 strlen(tp->cmd_name)) == 0) {
962 break;
963 }
964 }
965 }
966 }
967
968 if (i < kdb_max_commands) {
969 int result;
970 KDB_STATE_SET(CMD);
971 result = (*tp->cmd_func)(argc-1, (const char **)argv);
972 if (result && ignore_errors && result > KDB_CMD_GO)
973 result = 0;
974 KDB_STATE_CLEAR(CMD);
975 switch (tp->cmd_repeat) {
976 case KDB_REPEAT_NONE:
977 argc = 0;
978 if (argv[0])
979 *(argv[0]) = '\0';
980 break;
981 case KDB_REPEAT_NO_ARGS:
982 argc = 1;
983 if (argv[1])
984 *(argv[1]) = '\0';
985 break;
986 case KDB_REPEAT_WITH_ARGS:
987 break;
988 }
989 return result;
990 }
991
992 /*
993 * If the input with which we were presented does not
994 * map to an existing command, attempt to parse it as an
995 * address argument and display the result. Useful for
996 * obtaining the address of a variable, or the nearest symbol
997 * to an address contained in a register.
998 */
999 {
1000 unsigned long value;
1001 char *name = NULL;
1002 long offset;
1003 int nextarg = 0;
1004
1005 if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
1006 &value, &offset, &name)) {
1007 return KDB_NOTFOUND;
1008 }
1009
1010 kdb_printf("%s = ", argv[0]);
1011 kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
1012 kdb_printf("\n");
1013 return 0;
1014 }
1015}
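/*
 * A minimal illustrative sketch, not part of the original file: two
 * properties of kdb_parse() that are easy to miss are that a prefix match
 * lets "md1c20" reach the md handler, and that input matching no command
 * at all is treated as an address expression and printed symbolically.
 * "jiffies" is just an example of a well-known kernel symbol.
 */
static void example_parse(void)
{
	kdb_parse("md1c20 jiffies");	/* prefix-matches the "md" command */
	kdb_parse("jiffies");		/* no command: resolved via kdbgetaddrarg() */
}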
1016
1017
1018static int handle_ctrl_cmd(char *cmd)
1019{
1020#define CTRL_P 16
1021#define CTRL_N 14
1022
1023 /* initial situation */
1024 if (cmd_head == cmd_tail)
1025 return 0;
1026 switch (*cmd) {
1027 case CTRL_P:
1028 if (cmdptr != cmd_tail)
1029 cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
1030 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1031 return 1;
1032 case CTRL_N:
1033 if (cmdptr != cmd_head)
1034 cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
1035 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1036 return 1;
1037 }
1038 return 0;
1039}
1040
1041/*
1042 * kdb_reboot - This function implements the 'reboot' command. Reboot
 1043 * the system immediately, or loop forever on failure.
1044 */
1045static int kdb_reboot(int argc, const char **argv)
1046{
1047 emergency_restart();
1048 kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
1049 while (1)
1050 cpu_relax();
1051 /* NOTREACHED */
1052 return 0;
1053}
1054
1055static void kdb_dumpregs(struct pt_regs *regs)
1056{
1057 int old_lvl = console_loglevel;
1058 console_loglevel = 15;
1059 kdb_trap_printk++;
1060 show_regs(regs);
1061 kdb_trap_printk--;
1062 kdb_printf("\n");
1063 console_loglevel = old_lvl;
1064}
1065
1066void kdb_set_current_task(struct task_struct *p)
1067{
1068 kdb_current_task = p;
1069
1070 if (kdb_task_has_cpu(p)) {
1071 kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
1072 return;
1073 }
1074 kdb_current_regs = NULL;
1075}
1076
1077/*
1078 * kdb_local - The main code for kdb. This routine is invoked on a
1079 * specific processor, it is not global. The main kdb() routine
1080 * ensures that only one processor at a time is in this routine.
1081 * This code is called with the real reason code on the first
1082 * entry to a kdb session, thereafter it is called with reason
1083 * SWITCH, even if the user goes back to the original cpu.
1084 * Inputs:
1085 * reason The reason KDB was invoked
1086 * error The hardware-defined error code
1087 * regs The exception frame at time of fault/breakpoint.
1088 * db_result Result code from the break or debug point.
1089 * Returns:
 1090 * 0 KDB was invoked for an event for which it was not responsible
1091 * 1 KDB handled the event for which it was invoked.
1092 * KDB_CMD_GO User typed 'go'.
1093 * KDB_CMD_CPU User switched to another cpu.
1094 * KDB_CMD_SS Single step.
1095 * KDB_CMD_SSB Single step until branch.
1096 */
1097static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1098 kdb_dbtrap_t db_result)
1099{
1100 char *cmdbuf;
1101 int diag;
1102 struct task_struct *kdb_current =
1103 kdb_curr_task(raw_smp_processor_id());
1104
1105 KDB_DEBUG_STATE("kdb_local 1", reason);
1106 kdb_go_count = 0;
1107 if (reason == KDB_REASON_DEBUG) {
1108 /* special case below */
1109 } else {
1110 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1111 kdb_current, kdb_current->pid);
1112#if defined(CONFIG_SMP)
1113 kdb_printf("on processor %d ", raw_smp_processor_id());
1114#endif
1115 }
1116
1117 switch (reason) {
1118 case KDB_REASON_DEBUG:
1119 {
1120 /*
1121 * If re-entering kdb after a single step
1122 * command, don't print the message.
1123 */
1124 switch (db_result) {
1125 case KDB_DB_BPT:
1126 kdb_printf("\nEntering kdb (0x%p, pid %d) ",
1127 kdb_current, kdb_current->pid);
1128#if defined(CONFIG_SMP)
1129 kdb_printf("on processor %d ", raw_smp_processor_id());
1130#endif
1131 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1132 instruction_pointer(regs));
1133 break;
1134 case KDB_DB_SSB:
1135 /*
1136 * In the midst of ssb command. Just return.
1137 */
1138 KDB_DEBUG_STATE("kdb_local 3", reason);
1139 return KDB_CMD_SSB; /* Continue with SSB command */
1140
1141 break;
1142 case KDB_DB_SS:
1143 break;
1144 case KDB_DB_SSBPT:
1145 KDB_DEBUG_STATE("kdb_local 4", reason);
1146 return 1; /* kdba_db_trap did the work */
1147 default:
1148 kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
1149 db_result);
1150 break;
1151 }
1152
1153 }
1154 break;
1155 case KDB_REASON_ENTER:
1156 if (KDB_STATE(KEYBOARD))
1157 kdb_printf("due to Keyboard Entry\n");
1158 else
1159 kdb_printf("due to KDB_ENTER()\n");
1160 break;
1161 case KDB_REASON_KEYBOARD:
1162 KDB_STATE_SET(KEYBOARD);
1163 kdb_printf("due to Keyboard Entry\n");
1164 break;
1165 case KDB_REASON_ENTER_SLAVE:
1166 /* drop through, slaves only get released via cpu switch */
1167 case KDB_REASON_SWITCH:
1168 kdb_printf("due to cpu switch\n");
1169 break;
1170 case KDB_REASON_OOPS:
1171 kdb_printf("Oops: %s\n", kdb_diemsg);
1172 kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
1173 instruction_pointer(regs));
1174 kdb_dumpregs(regs);
1175 break;
1176 case KDB_REASON_NMI:
1177 kdb_printf("due to NonMaskable Interrupt @ "
1178 kdb_machreg_fmt "\n",
1179 instruction_pointer(regs));
1180 kdb_dumpregs(regs);
1181 break;
1182 case KDB_REASON_SSTEP:
1183 case KDB_REASON_BREAK:
1184 kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
1185 reason == KDB_REASON_BREAK ?
1186 "Breakpoint" : "SS trap", instruction_pointer(regs));
1187 /*
1188 * Determine if this breakpoint is one that we
1189 * are interested in.
1190 */
1191 if (db_result != KDB_DB_BPT) {
1192 kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
1193 db_result);
1194 KDB_DEBUG_STATE("kdb_local 6", reason);
1195 return 0; /* Not for us, dismiss it */
1196 }
1197 break;
1198 case KDB_REASON_RECURSE:
1199 kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
1200 instruction_pointer(regs));
1201 break;
1202 default:
1203 kdb_printf("kdb: unexpected reason code: %d\n", reason);
1204 KDB_DEBUG_STATE("kdb_local 8", reason);
1205 return 0; /* Not for us, dismiss it */
1206 }
1207
1208 while (1) {
1209 /*
1210 * Initialize pager context.
1211 */
1212 kdb_nextline = 1;
1213 KDB_STATE_CLEAR(SUPPRESS);
1214
1215 cmdbuf = cmd_cur;
1216 *cmdbuf = '\0';
1217 *(cmd_hist[cmd_head]) = '\0';
1218
1219 if (KDB_FLAG(ONLY_DO_DUMP)) {
1220 /* kdb is off but a catastrophic error requires a dump.
1221 * Take the dump and reboot.
1222 * Turn on logging so the kdb output appears in the log
1223 * buffer in the dump.
1224 */
1225 const char *setargs[] = { "set", "LOGGING", "1" };
1226 kdb_set(2, setargs);
1227 kdb_reboot(0, NULL);
1228 /*NOTREACHED*/
1229 }
1230
1231do_full_getstr:
1232#if defined(CONFIG_SMP)
1233 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
1234 raw_smp_processor_id());
1235#else
1236 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
1237#endif
1238 if (defcmd_in_progress)
1239 strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
1240
1241 /*
1242 * Fetch command from keyboard
1243 */
1244 cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
1245 if (*cmdbuf != '\n') {
1246 if (*cmdbuf < 32) {
1247 if (cmdptr == cmd_head) {
1248 strncpy(cmd_hist[cmd_head], cmd_cur,
1249 CMD_BUFLEN);
1250 *(cmd_hist[cmd_head] +
1251 strlen(cmd_hist[cmd_head])-1) = '\0';
1252 }
1253 if (!handle_ctrl_cmd(cmdbuf))
1254 *(cmd_cur+strlen(cmd_cur)-1) = '\0';
1255 cmdbuf = cmd_cur;
1256 goto do_full_getstr;
1257 } else {
1258 strncpy(cmd_hist[cmd_head], cmd_cur,
1259 CMD_BUFLEN);
1260 }
1261
1262 cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
1263 if (cmd_head == cmd_tail)
1264 cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
1265 }
1266
1267 cmdptr = cmd_head;
1268 diag = kdb_parse(cmdbuf);
1269 if (diag == KDB_NOTFOUND) {
1270 kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
1271 diag = 0;
1272 }
1273 if (diag == KDB_CMD_GO
1274 || diag == KDB_CMD_CPU
1275 || diag == KDB_CMD_SS
1276 || diag == KDB_CMD_SSB
1277 || diag == KDB_CMD_KGDB)
1278 break;
1279
1280 if (diag)
1281 kdb_cmderror(diag);
1282 }
1283 KDB_DEBUG_STATE("kdb_local 9", diag);
1284 return diag;
1285}
1286
1287
1288/*
1289 * kdb_print_state - Print the state data for the current processor
1290 * for debugging.
1291 * Inputs:
1292 * text Identifies the debug point
1293 * value Any integer value to be printed, e.g. reason code.
1294 */
1295void kdb_print_state(const char *text, int value)
1296{
1297 kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
1298 text, raw_smp_processor_id(), value, kdb_initial_cpu,
1299 kdb_state);
1300}
1301
1302/*
1303 * kdb_main_loop - After initial setup and assignment of the
1304 * controlling cpu, all cpus are in this loop. One cpu is in
1305 * control and will issue the kdb prompt, the others will spin
1306 * until 'go' or cpu switch.
1307 *
1308 * To get a consistent view of the kernel stacks for all
1309 * processes, this routine is invoked from the main kdb code via
1310 * an architecture specific routine. kdba_main_loop is
1311 * responsible for making the kernel stacks consistent for all
 1312 * processes; there should be no difference between a blocked
1313 * process and a running process as far as kdb is concerned.
1314 * Inputs:
1315 * reason The reason KDB was invoked
1316 * error The hardware-defined error code
1317 * reason2 kdb's current reason code.
1318 * Initially error but can change
 1319 * according to kdb state.
1320 * db_result Result code from break or debug point.
1321 * regs The exception frame at time of fault/breakpoint.
1322 * should always be valid.
1323 * Returns:
 1324 * 0 KDB was invoked for an event for which it was not responsible
1325 * 1 KDB handled the event for which it was invoked.
1326 */
1327int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1328 kdb_dbtrap_t db_result, struct pt_regs *regs)
1329{
1330 int result = 1;
1331 /* Stay in kdb() until 'go', 'ss[b]' or an error */
1332 while (1) {
1333 /*
1334 * All processors except the one that is in control
1335 * will spin here.
1336 */
1337 KDB_DEBUG_STATE("kdb_main_loop 1", reason);
1338 while (KDB_STATE(HOLD_CPU)) {
1339 /* state KDB is turned off by kdb_cpu to see if the
1340 * other cpus are still live, each cpu in this loop
1341 * turns it back on.
1342 */
1343 if (!KDB_STATE(KDB))
1344 KDB_STATE_SET(KDB);
1345 }
1346
1347 KDB_STATE_CLEAR(SUPPRESS);
1348 KDB_DEBUG_STATE("kdb_main_loop 2", reason);
1349 if (KDB_STATE(LEAVING))
1350 break; /* Another cpu said 'go' */
1351 /* Still using kdb, this processor is in control */
1352 result = kdb_local(reason2, error, regs, db_result);
1353 KDB_DEBUG_STATE("kdb_main_loop 3", result);
1354
1355 if (result == KDB_CMD_CPU)
1356 break;
1357
1358 if (result == KDB_CMD_SS) {
1359 KDB_STATE_SET(DOING_SS);
1360 break;
1361 }
1362
1363 if (result == KDB_CMD_SSB) {
1364 KDB_STATE_SET(DOING_SS);
1365 KDB_STATE_SET(DOING_SSB);
1366 break;
1367 }
1368
1369 if (result == KDB_CMD_KGDB) {
1370 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
1371 kdb_printf("Entering please attach debugger "
1372 "or use $D#44+ or $3#33\n");
1373 break;
1374 }
1375 if (result && result != 1 && result != KDB_CMD_GO)
1376 kdb_printf("\nUnexpected kdb_local return code %d\n",
1377 result);
1378 KDB_DEBUG_STATE("kdb_main_loop 4", reason);
1379 break;
1380 }
1381 if (KDB_STATE(DOING_SS))
1382 KDB_STATE_CLEAR(SSBPT);
1383
1384 return result;
1385}
1386
1387/*
1388 * kdb_mdr - This function implements the guts of the 'mdr', memory
1389 * read command.
1390 * mdr <addr arg>,<byte count>
1391 * Inputs:
1392 * addr Start address
1393 * count Number of bytes
1394 * Returns:
1395 * Always 0. Any errors are detected and printed by kdb_getarea.
1396 */
1397static int kdb_mdr(unsigned long addr, unsigned int count)
1398{
1399 unsigned char c;
1400 while (count--) {
1401 if (kdb_getarea(c, addr))
1402 return 0;
1403 kdb_printf("%02x", c);
1404 addr++;
1405 }
1406 kdb_printf("\n");
1407 return 0;
1408}
1409
1410/*
1411 * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
1412 * 'md8' 'mdr' and 'mds' commands.
1413 *
1414 * md|mds [<addr arg> [<line count> [<radix>]]]
1415 * mdWcN [<addr arg> [<line count> [<radix>]]]
1416 * where W = is the width (1, 2, 4 or 8) and N is the count.
 1417 * e.g., md1c20 reads 20 bytes, 1 at a time.
1418 * mdr <addr arg>,<byte count>
1419 */
1420static void kdb_md_line(const char *fmtstr, unsigned long addr,
1421 int symbolic, int nosect, int bytesperword,
1422 int num, int repeat, int phys)
1423{
1424 /* print just one line of data */
1425 kdb_symtab_t symtab;
1426 char cbuf[32];
1427 char *c = cbuf;
1428 int i;
1429 unsigned long word;
1430
1431 memset(cbuf, '\0', sizeof(cbuf));
1432 if (phys)
1433 kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
1434 else
1435 kdb_printf(kdb_machreg_fmt0 " ", addr);
1436
1437 for (i = 0; i < num && repeat--; i++) {
1438 if (phys) {
1439 if (kdb_getphysword(&word, addr, bytesperword))
1440 break;
1441 } else if (kdb_getword(&word, addr, bytesperword))
1442 break;
1443 kdb_printf(fmtstr, word);
1444 if (symbolic)
1445 kdbnearsym(word, &symtab);
1446 else
1447 memset(&symtab, 0, sizeof(symtab));
1448 if (symtab.sym_name) {
1449 kdb_symbol_print(word, &symtab, 0);
1450 if (!nosect) {
1451 kdb_printf("\n");
1452 kdb_printf(" %s %s "
1453 kdb_machreg_fmt " "
1454 kdb_machreg_fmt " "
1455 kdb_machreg_fmt, symtab.mod_name,
1456 symtab.sec_name, symtab.sec_start,
1457 symtab.sym_start, symtab.sym_end);
1458 }
1459 addr += bytesperword;
1460 } else {
1461 union {
1462 u64 word;
1463 unsigned char c[8];
1464 } wc;
1465 unsigned char *cp;
1466#ifdef __BIG_ENDIAN
1467 cp = wc.c + 8 - bytesperword;
1468#else
1469 cp = wc.c;
1470#endif
1471 wc.word = word;
1472#define printable_char(c) \
1473 ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
1474 switch (bytesperword) {
1475 case 8:
1476 *c++ = printable_char(*cp++);
1477 *c++ = printable_char(*cp++);
1478 *c++ = printable_char(*cp++);
1479 *c++ = printable_char(*cp++);
1480 addr += 4;
1481 case 4:
1482 *c++ = printable_char(*cp++);
1483 *c++ = printable_char(*cp++);
1484 addr += 2;
1485 case 2:
1486 *c++ = printable_char(*cp++);
1487 addr++;
1488 case 1:
1489 *c++ = printable_char(*cp++);
1490 addr++;
1491 break;
1492 }
1493#undef printable_char
1494 }
1495 }
1496 kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
1497 " ", cbuf);
1498}
1499
1500static int kdb_md(int argc, const char **argv)
1501{
1502 static unsigned long last_addr;
1503 static int last_radix, last_bytesperword, last_repeat;
1504 int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
1505 int nosect = 0;
1506 char fmtchar, fmtstr[64];
1507 unsigned long addr;
1508 unsigned long word;
1509 long offset = 0;
1510 int symbolic = 0;
1511 int valid = 0;
1512 int phys = 0;
1513
1514 kdbgetintenv("MDCOUNT", &mdcount);
1515 kdbgetintenv("RADIX", &radix);
1516 kdbgetintenv("BYTESPERWORD", &bytesperword);
1517
1518 /* Assume 'md <addr>' and start with environment values */
1519 repeat = mdcount * 16 / bytesperword;
1520
1521 if (strcmp(argv[0], "mdr") == 0) {
1522 if (argc != 2)
1523 return KDB_ARGCOUNT;
1524 valid = 1;
1525 } else if (isdigit(argv[0][2])) {
1526 bytesperword = (int)(argv[0][2] - '0');
1527 if (bytesperword == 0) {
1528 bytesperword = last_bytesperword;
1529 if (bytesperword == 0)
1530 bytesperword = 4;
1531 }
1532 last_bytesperword = bytesperword;
1533 repeat = mdcount * 16 / bytesperword;
1534 if (!argv[0][3])
1535 valid = 1;
1536 else if (argv[0][3] == 'c' && argv[0][4]) {
1537 char *p;
1538 repeat = simple_strtoul(argv[0] + 4, &p, 10);
1539 mdcount = ((repeat * bytesperword) + 15) / 16;
1540 valid = !*p;
1541 }
1542 last_repeat = repeat;
1543 } else if (strcmp(argv[0], "md") == 0)
1544 valid = 1;
1545 else if (strcmp(argv[0], "mds") == 0)
1546 valid = 1;
1547 else if (strcmp(argv[0], "mdp") == 0) {
1548 phys = valid = 1;
1549 }
1550 if (!valid)
1551 return KDB_NOTFOUND;
1552
1553 if (argc == 0) {
1554 if (last_addr == 0)
1555 return KDB_ARGCOUNT;
1556 addr = last_addr;
1557 radix = last_radix;
1558 bytesperword = last_bytesperword;
1559 repeat = last_repeat;
1560 mdcount = ((repeat * bytesperword) + 15) / 16;
1561 }
1562
1563 if (argc) {
1564 unsigned long val;
1565 int diag, nextarg = 1;
1566 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
1567 &offset, NULL);
1568 if (diag)
1569 return diag;
1570 if (argc > nextarg+2)
1571 return KDB_ARGCOUNT;
1572
1573 if (argc >= nextarg) {
1574 diag = kdbgetularg(argv[nextarg], &val);
1575 if (!diag) {
1576 mdcount = (int) val;
1577 repeat = mdcount * 16 / bytesperword;
1578 }
1579 }
1580 if (argc >= nextarg+1) {
1581 diag = kdbgetularg(argv[nextarg+1], &val);
1582 if (!diag)
1583 radix = (int) val;
1584 }
1585 }
1586
1587 if (strcmp(argv[0], "mdr") == 0)
1588 return kdb_mdr(addr, mdcount);
1589
1590 switch (radix) {
1591 case 10:
1592 fmtchar = 'd';
1593 break;
1594 case 16:
1595 fmtchar = 'x';
1596 break;
1597 case 8:
1598 fmtchar = 'o';
1599 break;
1600 default:
1601 return KDB_BADRADIX;
1602 }
1603
1604 last_radix = radix;
1605
1606 if (bytesperword > KDB_WORD_SIZE)
1607 return KDB_BADWIDTH;
1608
1609 switch (bytesperword) {
1610 case 8:
1611 sprintf(fmtstr, "%%16.16l%c ", fmtchar);
1612 break;
1613 case 4:
1614 sprintf(fmtstr, "%%8.8l%c ", fmtchar);
1615 break;
1616 case 2:
1617 sprintf(fmtstr, "%%4.4l%c ", fmtchar);
1618 break;
1619 case 1:
1620 sprintf(fmtstr, "%%2.2l%c ", fmtchar);
1621 break;
1622 default:
1623 return KDB_BADWIDTH;
1624 }
1625
1626 last_repeat = repeat;
1627 last_bytesperword = bytesperword;
1628
1629 if (strcmp(argv[0], "mds") == 0) {
1630 symbolic = 1;
1631 /* Do not save these changes as last_*, they are temporary mds
1632 * overrides.
1633 */
1634 bytesperword = KDB_WORD_SIZE;
1635 repeat = mdcount;
1636 kdbgetintenv("NOSECT", &nosect);
1637 }
1638
1639 /* Round address down modulo BYTESPERWORD */
1640
1641 addr &= ~(bytesperword-1);
1642
1643 while (repeat > 0) {
1644 unsigned long a;
1645 int n, z, num = (symbolic ? 1 : (16 / bytesperword));
1646
1647 if (KDB_FLAG(CMD_INTERRUPT))
1648 return 0;
1649 for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
1650 if (phys) {
1651 if (kdb_getphysword(&word, a, bytesperword)
1652 || word)
1653 break;
1654 } else if (kdb_getword(&word, a, bytesperword) || word)
1655 break;
1656 }
1657 n = min(num, repeat);
1658 kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
1659 num, repeat, phys);
1660 addr += bytesperword * n;
1661 repeat -= n;
1662 z = (z + num - 1) / num;
1663 if (z > 2) {
1664 int s = num * (z-2);
1665 kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
1666 " zero suppressed\n",
1667 addr, addr + bytesperword * s - 1);
1668 addr += bytesperword * s;
1669 repeat -= s;
1670 }
1671 }
1672 last_addr = addr;
1673
1674 return 0;
1675}
1676
1677/*
1678 * kdb_mm - This function implements the 'mm' command.
1679 * mm address-expression new-value
1680 * Remarks:
 1681 * mm works on machine words, mmW works on W-byte values.
1682 */
1683static int kdb_mm(int argc, const char **argv)
1684{
1685 int diag;
1686 unsigned long addr;
1687 long offset = 0;
1688 unsigned long contents;
1689 int nextarg;
1690 int width;
1691
1692 if (argv[0][2] && !isdigit(argv[0][2]))
1693 return KDB_NOTFOUND;
1694
1695 if (argc < 2)
1696 return KDB_ARGCOUNT;
1697
1698 nextarg = 1;
1699 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1700 if (diag)
1701 return diag;
1702
1703 if (nextarg > argc)
1704 return KDB_ARGCOUNT;
1705 diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
1706 if (diag)
1707 return diag;
1708
1709 if (nextarg != argc + 1)
1710 return KDB_ARGCOUNT;
1711
1712 width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
1713 diag = kdb_putword(addr, contents, width);
1714 if (diag)
1715 return diag;
1716
1717 kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
1718
1719 return 0;
1720}
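/*
 * A minimal illustrative sketch, not part of the original file: kdb_mm()
 * boils down to a kdb_putword() of 'width' bytes, and reading the value
 * back with kdb_getword() (as kdb_md_line() does) is the natural way to
 * confirm the store.  The address and value here are placeholders and the
 * helper name is hypothetical.
 */
static int example_poke_and_verify(unsigned long addr, unsigned long val)
{
	unsigned long readback;
	int diag;

	diag = kdb_putword(addr, val, KDB_WORD_SIZE);
	if (diag)
		return diag;
	if (!kdb_getword(&readback, addr, KDB_WORD_SIZE))
		kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n",
			   addr, readback);
	return 0;
}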
1721
1722/*
1723 * kdb_go - This function implements the 'go' command.
1724 * go [address-expression]
1725 */
1726static int kdb_go(int argc, const char **argv)
1727{
1728 unsigned long addr;
1729 int diag;
1730 int nextarg;
1731 long offset;
1732
1733 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL);
1743 if (diag)
1744 return diag;
1745 } else if (argc) {
1746 return KDB_ARGCOUNT;
1747 }
1748
1749 diag = KDB_CMD_GO;
1750 if (KDB_FLAG(CATASTROPHIC)) {
1751 kdb_printf("Catastrophic error detected\n");
1752 kdb_printf("kdb_continue_catastrophic=%d, ",
1753 kdb_continue_catastrophic);
1754 if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
1755 kdb_printf("type go a second time if you really want "
1756 "to continue\n");
1757 return 0;
1758 }
1759 if (kdb_continue_catastrophic == 2) {
1760 kdb_printf("forcing reboot\n");
1761 kdb_reboot(0, NULL);
1762 }
1763 kdb_printf("attempting to continue\n");
1764 }
1765 return diag;
1766}
1767
1768/*
1769 * kdb_rd - This function implements the 'rd' command.
1770 */
1771static int kdb_rd(int argc, const char **argv)
1772{
1773 int diag = kdb_check_regs();
1774 if (diag)
1775 return diag;
1776
1777 kdb_dumpregs(kdb_current_regs);
1778 return 0;
1779}
1780
1781/*
1782 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents
1784 * Remarks:
1785 * Currently doesn't allow modification of control or
1786 * debug registers.
1787 */
1788static int kdb_rm(int argc, const char **argv)
1789{
1790 int diag;
1791 int ind = 0;
1792 unsigned long contents;
1793
1794 if (argc != 2)
1795 return KDB_ARGCOUNT;
1796 /*
1797 * Allow presence or absence of leading '%' symbol.
1798 */
1799 if (argv[1][0] == '%')
1800 ind = 1;
1801
1802 diag = kdbgetularg(argv[2], &contents);
1803 if (diag)
1804 return diag;
1805
1806 diag = kdb_check_regs();
1807 if (diag)
1808 return diag;
1809 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0;
1811}
1812
1813#if defined(CONFIG_MAGIC_SYSRQ)
1814/*
1815 * kdb_sr - This function implements the 'sr' (SYSRQ key) command
1816 * which interfaces to the soi-disant MAGIC SYSRQ functionality.
1817 * sr <magic-sysrq-code>
1818 */
1819static int kdb_sr(int argc, const char **argv)
1820{
1821 if (argc != 1)
1822 return KDB_ARGCOUNT;
1823 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0);
1825 kdb_trap_printk--;
1826
1827 return 0;
1828}
1829#endif /* CONFIG_MAGIC_SYSRQ */
1830
1831/*
1832 * kdb_ef - This function implements the 'regs' (display exception
1833 * frame) command. This command takes an address and expects to
1834 * find an exception frame at that address, formats and prints
1835 * it.
1836 * regs address-expression
1837 * Remarks:
1838 * Not done yet.
1839 */
1840static int kdb_ef(int argc, const char **argv)
1841{
1842 int diag;
1843 unsigned long addr;
1844 long offset;
1845 int nextarg;
1846
1847 if (argc != 1)
1848 return KDB_ARGCOUNT;
1849
1850 nextarg = 1;
1851 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1852 if (diag)
1853 return diag;
1854 show_regs((struct pt_regs *)addr);
1855 return 0;
1856}
1857
1858#if defined(CONFIG_MODULES)
1859/*
1860 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1861 * currently loaded kernel modules.
1862 * Mostly taken from userland lsmod.
1863 */
1864static int kdb_lsmod(int argc, const char **argv)
1865{
1866 struct module *mod;
1867
1868 if (argc != 0)
1869 return KDB_ARGCOUNT;
1870
1871 kdb_printf("Module Size modstruct Used by\n");
1872 list_for_each_entry(mod, kdb_modules, list) {
1873
1874 kdb_printf("%-20s%8u 0x%p ", mod->name,
1875 mod->core_size, (void *)mod);
1876#ifdef CONFIG_MODULE_UNLOAD
1877 kdb_printf("%4d ", module_refcount(mod));
1878#endif
1879 if (mod->state == MODULE_STATE_GOING)
1880 kdb_printf(" (Unloading)");
1881 else if (mod->state == MODULE_STATE_COMING)
1882 kdb_printf(" (Loading)");
1883 else
1884 kdb_printf(" (Live)");
1885 kdb_printf(" 0x%p", mod->module_core);
1886
1887#ifdef CONFIG_MODULE_UNLOAD
1888 {
1889 struct module_use *use;
1890 kdb_printf(" [ ");
1891 list_for_each_entry(use, &mod->source_list,
1892 source_list)
1893 kdb_printf("%s ", use->target->name);
1894 kdb_printf("]\n");
1895 }
1896#endif
1897 }
1898
1899 return 0;
1900}
1901
1902#endif /* CONFIG_MODULES */
1903
1904/*
1905 * kdb_env - This function implements the 'env' command. Display the
1906 * current environment variables.
1907 */
1908
1909static int kdb_env(int argc, const char **argv)
1910{
1911 int i;
1912
1913 for (i = 0; i < __nenv; i++) {
1914 if (__env[i])
1915 kdb_printf("%s\n", __env[i]);
1916 }
1917
1918 if (KDB_DEBUG(MASK))
1919 kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
1920
1921 return 0;
1922}
1923
1924#ifdef CONFIG_PRINTK
1925/*
1926 * kdb_dmesg - This function implements the 'dmesg' command to display
1927 * the contents of the syslog buffer.
1928 * dmesg [lines] [adjust]
1929 */
1930static int kdb_dmesg(int argc, const char **argv)
1931{
1932 char *syslog_data[4], *start, *end, c = '\0', *p;
1933 int diag, logging, logsize, lines = 0, adjust = 0, n;
1934
1935 if (argc > 2)
1936 return KDB_ARGCOUNT;
1937 if (argc) {
1938 char *cp;
1939 lines = simple_strtol(argv[1], &cp, 0);
1940 if (*cp)
1941 lines = 0;
1942 if (argc > 1) {
1943 adjust = simple_strtoul(argv[2], &cp, 0);
1944 if (*cp || adjust < 0)
1945 adjust = 0;
1946 }
1947 }
1948
1949 /* disable LOGGING if set */
1950 diag = kdbgetintenv("LOGGING", &logging);
1951 if (!diag && logging) {
1952 const char *setargs[] = { "set", "LOGGING", "0" };
1953 kdb_set(2, setargs);
1954 }
1955
1956 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
1957 * logical start, end+1. */
1958 kdb_syslog_data(syslog_data);
1959 if (syslog_data[2] == syslog_data[3])
1960 return 0;
1961 logsize = syslog_data[1] - syslog_data[0];
1962 start = syslog_data[2];
1963 end = syslog_data[3];
1964#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
1965 for (n = 0, p = start; p < end; ++p) {
1966 c = *KDB_WRAP(p);
1967 if (c == '\n')
1968 ++n;
1969 }
1970 if (c != '\n')
1971 ++n;
1972 if (lines < 0) {
1973 if (adjust >= n)
1974 kdb_printf("buffer only contains %d lines, nothing "
1975 "printed\n", n);
1976 else if (adjust - lines >= n)
1977 kdb_printf("buffer only contains %d lines, last %d "
1978 "lines printed\n", n, n - adjust);
1979 if (adjust) {
1980 for (; start < end && adjust; ++start) {
1981 if (*KDB_WRAP(start) == '\n')
1982 --adjust;
1983 }
1984 if (start < end)
1985 ++start;
1986 }
1987 for (p = start; p < end && lines; ++p) {
1988 if (*KDB_WRAP(p) == '\n')
1989 ++lines;
1990 }
1991 end = p;
1992 } else if (lines > 0) {
1993 int skip = n - (adjust + lines);
1994 if (adjust >= n) {
1995 kdb_printf("buffer only contains %d lines, "
1996 "nothing printed\n", n);
1997 skip = n;
1998 } else if (skip < 0) {
1999 lines += skip;
2000 skip = 0;
2001 kdb_printf("buffer only contains %d lines, first "
2002 "%d lines printed\n", n, lines);
2003 }
2004 for (; start < end && skip; ++start) {
2005 if (*KDB_WRAP(start) == '\n')
2006 --skip;
2007 }
2008 for (p = start; p < end && lines; ++p) {
2009 if (*KDB_WRAP(p) == '\n')
2010 --lines;
2011 }
2012 end = p;
2013 }
2014 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2015 c = '\n';
2016 while (start != end) {
2017 char buf[201];
2018 p = buf;
2019 if (KDB_FLAG(CMD_INTERRUPT))
2020 return 0;
2021 while (start < end && (c = *KDB_WRAP(start)) &&
2022 (p - buf) < sizeof(buf)-1) {
2023 ++start;
2024 *p++ = c;
2025 if (c == '\n')
2026 break;
2027 }
2028 *p = '\0';
2029 kdb_printf("%s", buf);
2030 }
2031 if (c != '\n')
2032 kdb_printf("\n");
2033
2034 return 0;
2035}
2036#endif /* CONFIG_PRINTK */
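
The KDB_WRAP() arithmetic above folds a logical offset back into the physical syslog buffer so the loop can keep walking past the wrap point. A minimal user-space sketch of the same modulo trick, not part of this patch, using a made-up 16-byte ring in place of the real log buffer:

#include <stdio.h>

int main(void)
{
	/* Toy ring standing in for the syslog buffer: 16 physical bytes. */
	static const char ring[] = "ABCDEFGHIJKLMNOP";
	long logsize = sizeof(ring) - 1;	/* 16 data bytes, ignore the NUL */
	long logical = 20;			/* logical offset, 4 bytes past the end */
	long physical = logical % logsize;	/* folds back inside, like KDB_WRAP() */

	/* Prints: logical offset 20 wraps to physical offset 4 ('E') */
	printf("logical offset %ld wraps to physical offset %ld ('%c')\n",
	       logical, physical, ring[physical]);
	return 0;
}
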
2037/*
2038 * kdb_cpu - This function implements the 'cpu' command.
2039 * cpu [<cpunum>]
2040 * Returns:
2041 * KDB_CMD_CPU for success, a kdb diagnostic if error
2042 */
2043static void kdb_cpu_status(void)
2044{
2045 int i, start_cpu, first_print = 1;
2046 char state, prev_state = '?';
2047
2048 kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
2049 kdb_printf("Available cpus: ");
2050 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2051 if (!cpu_online(i)) {
2052 state = 'F'; /* cpu is offline */
2053 } else {
2054 state = ' '; /* cpu is responding to kdb */
2055 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
2056 state = 'I'; /* idle task */
2057 }
2058 if (state != prev_state) {
2059 if (prev_state != '?') {
2060 if (!first_print)
2061 kdb_printf(", ");
2062 first_print = 0;
2063 kdb_printf("%d", start_cpu);
2064 if (start_cpu < i-1)
2065 kdb_printf("-%d", i-1);
2066 if (prev_state != ' ')
2067 kdb_printf("(%c)", prev_state);
2068 }
2069 prev_state = state;
2070 start_cpu = i;
2071 }
2072 }
2073 /* print the trailing cpus, ignoring them if they are all offline */
2074 if (prev_state != 'F') {
2075 if (!first_print)
2076 kdb_printf(", ");
2077 kdb_printf("%d", start_cpu);
2078 if (start_cpu < i-1)
2079 kdb_printf("-%d", i-1);
2080 if (prev_state != ' ')
2081 kdb_printf("(%c)", prev_state);
2082 }
2083 kdb_printf("\n");
2084}
2085
2086static int kdb_cpu(int argc, const char **argv)
2087{
2088 unsigned long cpunum;
2089 int diag;
2090
2091 if (argc == 0) {
2092 kdb_cpu_status();
2093 return 0;
2094 }
2095
2096 if (argc != 1)
2097 return KDB_ARGCOUNT;
2098
2099 diag = kdbgetularg(argv[1], &cpunum);
2100 if (diag)
2101 return diag;
2102
2103 /*
2104 * Validate cpunum
2105 */
 2106	if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
2107 return KDB_BADCPUNUM;
2108
2109 dbg_switch_cpu = cpunum;
2110
2111 /*
2112 * Switch to other cpu
2113 */
2114 return KDB_CMD_CPU;
2115}
2116
2117/* The user may not realize that ps/bta with no parameters does not print idle
2118 * or sleeping system daemon processes, so tell them how many were suppressed.
2119 */
2120void kdb_ps_suppressed(void)
2121{
2122 int idle = 0, daemon = 0;
2123 unsigned long mask_I = kdb_task_state_string("I"),
2124 mask_M = kdb_task_state_string("M");
2125 unsigned long cpu;
2126 const struct task_struct *p, *g;
2127 for_each_online_cpu(cpu) {
2128 p = kdb_curr_task(cpu);
2129 if (kdb_task_state(p, mask_I))
2130 ++idle;
2131 }
2132 kdb_do_each_thread(g, p) {
2133 if (kdb_task_state(p, mask_M))
2134 ++daemon;
2135 } kdb_while_each_thread(g, p);
2136 if (idle || daemon) {
2137 if (idle)
2138 kdb_printf("%d idle process%s (state I)%s\n",
2139 idle, idle == 1 ? "" : "es",
2140 daemon ? " and " : "");
2141 if (daemon)
2142 kdb_printf("%d sleeping system daemon (state M) "
2143 "process%s", daemon,
2144 daemon == 1 ? "" : "es");
2145 kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
2146 }
2147}
2148
2149/*
2150 * kdb_ps - This function implements the 'ps' command which shows a
2151 * list of the active processes.
2152 * ps [DRSTCZEUIMA] All processes, optionally filtered by state
2153 */
2154void kdb_ps1(const struct task_struct *p)
2155{
2156 int cpu;
2157 unsigned long tmp;
2158
2159 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
2160 return;
2161
2162 cpu = kdb_process_cpu(p);
2163 kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
2164 (void *)p, p->pid, p->parent->pid,
2165 kdb_task_has_cpu(p), kdb_process_cpu(p),
2166 kdb_task_state_char(p),
2167 (void *)(&p->thread),
2168 p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
2169 p->comm);
2170 if (kdb_task_has_cpu(p)) {
2171 if (!KDB_TSK(cpu)) {
2172 kdb_printf(" Error: no saved data for this cpu\n");
2173 } else {
2174 if (KDB_TSK(cpu) != p)
2175 kdb_printf(" Error: does not match running "
2176 "process table (0x%p)\n", KDB_TSK(cpu));
2177 }
2178 }
2179}
2180
2181static int kdb_ps(int argc, const char **argv)
2182{
2183 struct task_struct *g, *p;
2184 unsigned long mask, cpu;
2185
2186 if (argc == 0)
2187 kdb_ps_suppressed();
2188 kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
2189 (int)(2*sizeof(void *))+2, "Task Addr",
2190 (int)(2*sizeof(void *))+2, "Thread");
2191 mask = kdb_task_state_string(argc ? argv[1] : NULL);
2192 /* Run the active tasks first */
2193 for_each_online_cpu(cpu) {
2194 if (KDB_FLAG(CMD_INTERRUPT))
2195 return 0;
2196 p = kdb_curr_task(cpu);
2197 if (kdb_task_state(p, mask))
2198 kdb_ps1(p);
2199 }
2200 kdb_printf("\n");
2201 /* Now the real tasks */
2202 kdb_do_each_thread(g, p) {
2203 if (KDB_FLAG(CMD_INTERRUPT))
2204 return 0;
2205 if (kdb_task_state(p, mask))
2206 kdb_ps1(p);
2207 } kdb_while_each_thread(g, p);
2208
2209 return 0;
2210}
2211
2212/*
2213 * kdb_pid - This function implements the 'pid' command which switches
2214 * the currently active process.
2215 * pid [<pid> | R]
2216 */
2217static int kdb_pid(int argc, const char **argv)
2218{
2219 struct task_struct *p;
2220 unsigned long val;
2221 int diag;
2222
2223 if (argc > 1)
2224 return KDB_ARGCOUNT;
2225
2226 if (argc) {
2227 if (strcmp(argv[1], "R") == 0) {
2228 p = KDB_TSK(kdb_initial_cpu);
2229 } else {
2230 diag = kdbgetularg(argv[1], &val);
2231 if (diag)
2232 return KDB_BADINT;
2233
2234 p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
2235 if (!p) {
2236 kdb_printf("No task with pid=%d\n", (pid_t)val);
2237 return 0;
2238 }
2239 }
2240 kdb_set_current_task(p);
2241 }
2242 kdb_printf("KDB current process is %s(pid=%d)\n",
2243 kdb_current_task->comm,
2244 kdb_current_task->pid);
2245
2246 return 0;
2247}
2248
2249/*
2250 * kdb_ll - This function implements the 'll' command which follows a
2251 * linked list and executes an arbitrary command for each
2252 * element.
2253 */
2254static int kdb_ll(int argc, const char **argv)
2255{
2256 int diag;
2257 unsigned long addr;
2258 long offset = 0;
2259 unsigned long va;
2260 unsigned long linkoffset;
2261 int nextarg;
2262 const char *command;
2263
2264 if (argc != 3)
2265 return KDB_ARGCOUNT;
2266
2267 nextarg = 1;
2268 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2269 if (diag)
2270 return diag;
2271
2272 diag = kdbgetularg(argv[2], &linkoffset);
2273 if (diag)
2274 return diag;
2275
2276 /*
 2277	 * Use the starting address as the first element in the
 2278	 * list, and assume that the list ends with a null
 2279	 * pointer.
2280 */
2281
2282 va = addr;
2283 command = kdb_strdup(argv[3], GFP_KDB);
2284 if (!command) {
2285 kdb_printf("%s: cannot duplicate command\n", __func__);
2286 return 0;
2287 }
2288 /* Recursive use of kdb_parse, do not use argv after this point */
2289 argv = NULL;
2290
2291 while (va) {
2292 char buf[80];
2293
2294 if (KDB_FLAG(CMD_INTERRUPT))
2295 return 0;
2296
2297 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2298 diag = kdb_parse(buf);
2299 if (diag)
2300 return diag;
2301
2302 addr = va + linkoffset;
2303 if (kdb_getword(&va, addr, sizeof(va)))
2304 return 0;
2305 }
2306 kfree(command);
2307
2308 return 0;
2309}
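
To make the 'll' arguments concrete: <linkoffset> is simply the byte offset of the next pointer inside each node, and <cmd> is run with every node address appended. A hedged in-kernel sketch, with a made-up node type, of how that offset would be computed; only the kdb invocation in the comment touches code from this patch:

#include <linux/kernel.h>
#include <linux/stddef.h>

/* Hypothetical node type whose instances are chained through 'next'. */
struct demo_node {
	unsigned long payload;
	struct demo_node *next;
};

/* With the head node's address in hand, one could then type inside kdb:
 *
 *	ll <address-of-first-node> <link-offset> md
 *
 * which runs 'md' on every node until a NULL next pointer ends the walk. */
void demo_print_link_offset(void)
{
	printk(KERN_INFO "ll link offset = %zu\n",
	       offsetof(struct demo_node, next));
}
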
2310
2311static int kdb_kgdb(int argc, const char **argv)
2312{
2313 return KDB_CMD_KGDB;
2314}
2315
2316/*
2317 * kdb_help - This function implements the 'help' and '?' commands.
2318 */
2319static int kdb_help(int argc, const char **argv)
2320{
2321 kdbtab_t *kt;
2322 int i;
2323
2324 kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
2325 kdb_printf("-----------------------------"
2326 "-----------------------------\n");
2327 for_each_kdbcmd(kt, i) {
2328 if (kt->cmd_name)
2329 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2330 kt->cmd_usage, kt->cmd_help);
2331 if (KDB_FLAG(CMD_INTERRUPT))
2332 return 0;
2333 }
2334 return 0;
2335}
2336
2337/*
2338 * kdb_kill - This function implements the 'kill' commands.
2339 */
2340static int kdb_kill(int argc, const char **argv)
2341{
2342 long sig, pid;
2343 char *endp;
2344 struct task_struct *p;
2345 struct siginfo info;
2346
2347 if (argc != 2)
2348 return KDB_ARGCOUNT;
2349
2350 sig = simple_strtol(argv[1], &endp, 0);
2351 if (*endp)
2352 return KDB_BADINT;
2353 if (sig >= 0) {
 2354		kdb_printf("Invalid signal parameter: <-signal>\n");
2355 return 0;
2356 }
2357 sig = -sig;
2358
2359 pid = simple_strtol(argv[2], &endp, 0);
2360 if (*endp)
2361 return KDB_BADINT;
2362 if (pid <= 0) {
 2363		kdb_printf("Process ID must be larger than 0.\n");
2364 return 0;
2365 }
2366
2367 /* Find the process. */
2368 p = find_task_by_pid_ns(pid, &init_pid_ns);
2369 if (!p) {
 2370		kdb_printf("The specified process was not found.\n");
2371 return 0;
2372 }
2373 p = p->group_leader;
2374 info.si_signo = sig;
2375 info.si_errno = 0;
2376 info.si_code = SI_USER;
2377 info.si_pid = pid; /* same capabilities as process being signalled */
2378 info.si_uid = 0; /* kdb has root authority */
2379 kdb_send_sig_info(p, &info);
2380 return 0;
2381}
2382
2383struct kdb_tm {
2384 int tm_sec; /* seconds */
2385 int tm_min; /* minutes */
2386 int tm_hour; /* hours */
2387 int tm_mday; /* day of the month */
2388 int tm_mon; /* month */
2389 int tm_year; /* year */
2390};
2391
2392static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2393{
2394 /* This will work from 1970-2099, 2100 is not a leap year */
2395 static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
2396 31, 30, 31, 30, 31 };
2397 memset(tm, 0, sizeof(*tm));
2398 tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
2399 tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
2400 (2 * 365 + 1); /* shift base from 1970 to 1968 */
2401 tm->tm_min = tm->tm_sec / 60 % 60;
2402 tm->tm_hour = tm->tm_sec / 60 / 60;
2403 tm->tm_sec = tm->tm_sec % 60;
2404 tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
2405 tm->tm_mday %= (4*365+1);
2406 mon_day[1] = 29;
2407 while (tm->tm_mday >= mon_day[tm->tm_mon]) {
2408 tm->tm_mday -= mon_day[tm->tm_mon];
2409 if (++tm->tm_mon == 12) {
2410 tm->tm_mon = 0;
2411 ++tm->tm_year;
2412 mon_day[1] = 28;
2413 }
2414 }
2415 ++tm->tm_mday;
2416}
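
As a quick check of the arithmetic above, take tv_sec = 0 (the 1970 epoch): the shift to a 1968 base gives tm_mday = 731, 731 / 1461 selects the zeroth four-year cycle so tm_year starts at 68, and the month loop then peels off all of 1968 (366 days, since mon_day[1] starts at 29) and all of 1969 (365 days, after the December rollover drops it to 28), leaving day 0 of year 70. The final ++tm_mday yields 1970-01-01 00:00:00, as expected.
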
2417
2418/*
2419 * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
2420 * I cannot call that code directly from kdb, it has an unconditional
2421 * cli()/sti() and calls routines that take locks which can stop the debugger.
2422 */
2423static void kdb_sysinfo(struct sysinfo *val)
2424{
2425 struct timespec uptime;
2426 do_posix_clock_monotonic_gettime(&uptime);
2427 memset(val, 0, sizeof(*val));
2428 val->uptime = uptime.tv_sec;
2429 val->loads[0] = avenrun[0];
2430 val->loads[1] = avenrun[1];
2431 val->loads[2] = avenrun[2];
2432 val->procs = nr_threads-1;
2433 si_meminfo(val);
2434
2435 return;
2436}
2437
2438/*
2439 * kdb_summary - This function implements the 'summary' command.
2440 */
2441static int kdb_summary(int argc, const char **argv)
2442{
2443 struct kdb_tm tm;
2444 struct sysinfo val;
2445
2446 if (argc)
2447 return KDB_ARGCOUNT;
2448
2449 kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
2450 kdb_printf("release %s\n", init_uts_ns.name.release);
2451 kdb_printf("version %s\n", init_uts_ns.name.version);
2452 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2453 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456
2457 kdb_gmtime(&xtime, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
2461 tm.tm_hour, tm.tm_min, tm.tm_sec,
2462 sys_tz.tz_minuteswest);
2463
2464 kdb_sysinfo(&val);
2465 kdb_printf("uptime ");
2466 if (val.uptime > (24*60*60)) {
2467 int days = val.uptime / (24*60*60);
2468 val.uptime %= (24*60*60);
2469 kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
2470 }
2471 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2472
2473 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2474
2475#define LOAD_INT(x) ((x) >> FSHIFT)
2476#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2477 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2478 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2479 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2480 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2481#undef LOAD_INT
2482#undef LOAD_FRAC
2483 /* Display in kilobytes */
2484#define K(x) ((x) << (PAGE_SHIFT - 10))
2485 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
2486 "Buffers: %8lu kB\n",
2487 val.totalram, val.freeram, val.bufferram);
2488 return 0;
2489}
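
The load averages printed above are fixed-point values from avenrun[]; assuming the usual FSHIFT of 11 (so FIXED_1 == 2048), LOAD_INT()/LOAD_FRAC() split one of them into an integer part and a two-digit fraction. A small user-space check of that conversion, not part of this patch:

#include <stdio.h>

#define FSHIFT		11			/* assumed fixed-point shift */
#define FIXED_1		(1 << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	unsigned long load = 3 * FIXED_1 / 2;	/* a load of exactly 1.50 */

	/* Prints "1.50": 3072 >> 11 == 1, (1024 * 100) >> 11 == 50 */
	printf("%lu.%02lu\n", LOAD_INT(load), LOAD_FRAC(load));
	return 0;
}
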
2490
2491/*
2492 * kdb_per_cpu - This function implements the 'per_cpu' command.
2493 */
2494static int kdb_per_cpu(int argc, const char **argv)
2495{
2496 char buf[256], fmtstr[64];
2497 kdb_symtab_t symtab;
2498 cpumask_t suppress = CPU_MASK_NONE;
2499 int cpu, diag;
2500 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2501
2502 if (argc < 1 || argc > 3)
2503 return KDB_ARGCOUNT;
2504
2505 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
2506 if (!kdbgetsymval(buf, &symtab)) {
2507 kdb_printf("%s is not a per_cpu variable\n", argv[1]);
2508 return KDB_BADADDR;
2509 }
2510 if (argc >= 2) {
2511 diag = kdbgetularg(argv[2], &bytesperword);
2512 if (diag)
2513 return diag;
2514 }
2515 if (!bytesperword)
2516 bytesperword = KDB_WORD_SIZE;
2517 else if (bytesperword > KDB_WORD_SIZE)
2518 return KDB_BADWIDTH;
2519 sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
2520 if (argc >= 3) {
2521 diag = kdbgetularg(argv[3], &whichcpu);
2522 if (diag)
2523 return diag;
2524 if (!cpu_online(whichcpu)) {
2525 kdb_printf("cpu %ld is not online\n", whichcpu);
2526 return KDB_BADCPUNUM;
2527 }
2528 }
2529
 2530	/* Most architectures use __per_cpu_offset[cpu], some use
 2531	 * __per_cpu_offset(cpu); !SMP builds have no __per_cpu_offset at all.
 2532	 */
2533#ifdef __per_cpu_offset
2534#define KDB_PCU(cpu) __per_cpu_offset(cpu)
2535#else
2536#ifdef CONFIG_SMP
2537#define KDB_PCU(cpu) __per_cpu_offset[cpu]
2538#else
2539#define KDB_PCU(cpu) 0
2540#endif
2541#endif
2542
2543 for_each_online_cpu(cpu) {
2544 if (whichcpu != ~0UL && whichcpu != cpu)
2545 continue;
2546 addr = symtab.sym_start + KDB_PCU(cpu);
2547 diag = kdb_getword(&val, addr, bytesperword);
2548 if (diag) {
2549 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2550 "read, diag=%d\n", cpu, addr, diag);
2551 continue;
2552 }
2553#ifdef CONFIG_SMP
2554 if (!val) {
2555 cpu_set(cpu, suppress);
2556 continue;
2557 }
2558#endif /* CONFIG_SMP */
2559 kdb_printf("%5d ", cpu);
2560 kdb_md_line(fmtstr, addr,
2561 bytesperword == KDB_WORD_SIZE,
2562 1, bytesperword, 1, 1, 0);
2563 }
2564 if (cpus_weight(suppress) == 0)
2565 return 0;
2566 kdb_printf("Zero suppressed cpu(s):");
2567 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2568 cpu = next_cpu(cpu, suppress)) {
2569 kdb_printf(" %d", cpu);
2570 if (cpu == num_possible_cpus() - 1 ||
2571 next_cpu(cpu, suppress) != cpu + 1)
2572 continue;
2573 while (cpu < num_possible_cpus() &&
2574 next_cpu(cpu, suppress) == cpu + 1)
2575 ++cpu;
2576 kdb_printf("-%d", cpu);
2577 }
2578 kdb_printf("\n");
2579
2580#undef KDB_PCU
2581
2582 return 0;
2583}
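
In other words, the command reads symtab.sym_start + KDB_PCU(cpu) for each selected cpu. As a usage illustration (the symbol name is only an assumption about this tree), 'per_cpu runqueues 8 1' would display the first eight bytes of per_cpu__runqueues on cpu 1 alone, provided the build is 64-bit so that a width of 8 does not exceed KDB_WORD_SIZE.
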
2584
2585/*
2586 * display help for the use of cmd | grep pattern
2587 */
2588static int kdb_grep_help(int argc, const char **argv)
2589{
2590 kdb_printf("Usage of cmd args | grep pattern:\n");
2591 kdb_printf(" Any command's output may be filtered through an ");
2592 kdb_printf("emulated 'pipe'.\n");
2593 kdb_printf(" 'grep' is just a key word.\n");
2594 kdb_printf(" The pattern may include a very limited set of "
2595 "metacharacters:\n");
2596 kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
2597 kdb_printf(" And if there are spaces in the pattern, you may "
2598 "quote it:\n");
2599 kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
2600 " or \"^pat tern$\"\n");
2601 return 0;
2602}
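
As a usage note grounded in the help text above: 'ps | grep init' filters the ps listing down to matching lines, and 'dmesg 100 | grep "^Free"' keeps only lines that begin with that word; anything fancier than the four anchored forms listed above is not recognized by the emulated pipe.
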
2603
2604/*
2605 * kdb_register_repeat - This function is used to register a kernel
2606 * debugger command.
2607 * Inputs:
2608 * cmd Command name
2609 * func Function to execute the command
2610 * usage A simple usage string showing arguments
2611 * help A simple help string describing command
2612 * repeat Does the command auto repeat on enter?
2613 * Returns:
2614 * zero for success, one if a duplicate command.
2615 */
2616#define kdb_command_extend 50 /* arbitrary */
2617int kdb_register_repeat(char *cmd,
2618 kdb_func_t func,
2619 char *usage,
2620 char *help,
2621 short minlen,
2622 kdb_repeat_t repeat)
2623{
2624 int i;
2625 kdbtab_t *kp;
2626
2627 /*
2628 * Brute force method to determine duplicates
2629 */
2630 for_each_kdbcmd(kp, i) {
2631 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2632 kdb_printf("Duplicate kdb command registered: "
2633 "%s, func %p help %s\n", cmd, func, help);
2634 return 1;
2635 }
2636 }
2637
2638 /*
2639 * Insert command into first available location in table
2640 */
2641 for_each_kdbcmd(kp, i) {
2642 if (kp->cmd_name == NULL)
2643 break;
2644 }
2645
2646 if (i >= kdb_max_commands) {
2647 kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
2648 kdb_command_extend) * sizeof(*new), GFP_KDB);
2649 if (!new) {
2650 kdb_printf("Could not allocate new kdb_command "
2651 "table\n");
2652 return 1;
2653 }
2654 if (kdb_commands) {
2655 memcpy(new, kdb_commands,
2656 kdb_max_commands * sizeof(*new));
2657 kfree(kdb_commands);
2658 }
2659 memset(new + kdb_max_commands, 0,
2660 kdb_command_extend * sizeof(*new));
2661 kdb_commands = new;
2662 kp = kdb_commands + kdb_max_commands;
2663 kdb_max_commands += kdb_command_extend;
2664 }
2665
2666 kp->cmd_name = cmd;
2667 kp->cmd_func = func;
2668 kp->cmd_usage = usage;
2669 kp->cmd_help = help;
2670 kp->cmd_flags = 0;
2671 kp->cmd_minlen = minlen;
2672 kp->cmd_repeat = repeat;
2673
2674 return 0;
2675}
2676
2677/*
2678 * kdb_register - Compatibility register function for commands that do
2679 * not need to specify a repeat state. Equivalent to
2680 * kdb_register_repeat with KDB_REPEAT_NONE.
2681 * Inputs:
2682 * cmd Command name
2683 * func Function to execute the command
2684 * usage A simple usage string showing arguments
2685 * help A simple help string describing command
2686 * Returns:
2687 * zero for success, one if a duplicate command.
2688 */
2689int kdb_register(char *cmd,
2690 kdb_func_t func,
2691 char *usage,
2692 char *help,
2693 short minlen)
2694{
2695 return kdb_register_repeat(cmd, func, usage, help, minlen,
2696 KDB_REPEAT_NONE);
2697}
2698
2699/*
2700 * kdb_unregister - This function is used to unregister a kernel
2701 * debugger command. It is generally called when a module which
2702 * implements kdb commands is unloaded.
2703 * Inputs:
2704 * cmd Command name
2705 * Returns:
2706 * zero for success, one command not registered.
2707 */
2708int kdb_unregister(char *cmd)
2709{
2710 int i;
2711 kdbtab_t *kp;
2712
2713 /*
2714 * find the command.
2715 */
2716 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
2717 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2718 kp->cmd_name = NULL;
2719 return 0;
2720 }
2721 }
2722
2723 /* Couldn't find it. */
2724 return 1;
2725}
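
The comment block in kdb_private.h later in this patch labels kdb_register()/kdb_unregister() as the interface for kernel loadable modules. A hedged sketch of a module-defined command follows; the command name and handler are invented, the prototypes are mirrored from kdb_private.h rather than pulled from a public header (which header, if any, exports them to out-of-tree modules may vary by tree), and whether the symbols are actually EXPORT_SYMBOL'ed is not shown in this hunk:

#include <linux/module.h>
#include <linux/init.h>

/* Mirrored from the externs in kdb_private.h; an in-tree user would
 * include the appropriate kdb header instead of redeclaring them. */
typedef int (*kdb_func_t)(int, const char **);
extern int kdb_printf(const char *, ...);
extern int kdb_register(char *, kdb_func_t, char *, char *, short);
extern int kdb_unregister(char *);

/* Hypothetical handler for a 'hello' command that ignores its arguments. */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from a module-defined kdb command\n");
	return 0;
}

static int __init kdb_hello_init(void)
{
	/* kdb_register() returns 0 on success, 1 on a duplicate name. */
	return kdb_register("hello", kdb_hello, "", "Print a greeting", 0);
}

static void __exit kdb_hello_exit(void)
{
	kdb_unregister("hello");
}

module_init(kdb_hello_init);
module_exit(kdb_hello_exit);
MODULE_LICENSE("GPL");
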
2726
2727/* Initialize the kdb command table. */
2728static void __init kdb_inittab(void)
2729{
2730 int i;
2731 kdbtab_t *kp;
2732
2733 for_each_kdbcmd(kp, i)
2734 kp->cmd_name = NULL;
2735
2736 kdb_register_repeat("md", kdb_md, "<vaddr>",
2737 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2738 KDB_REPEAT_NO_ARGS);
2739 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
2740 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
2741 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
2742 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
2743 kdb_register_repeat("mds", kdb_md, "<vaddr>",
2744 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
2745 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
2746 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
2747 kdb_register_repeat("go", kdb_go, "[<vaddr>]",
2748 "Continue Execution", 1, KDB_REPEAT_NONE);
2749 kdb_register_repeat("rd", kdb_rd, "",
2750 "Display Registers", 0, KDB_REPEAT_NONE);
2751 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
2752 "Modify Registers", 0, KDB_REPEAT_NONE);
2753 kdb_register_repeat("ef", kdb_ef, "<vaddr>",
2754 "Display exception frame", 0, KDB_REPEAT_NONE);
2755 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
2756 "Stack traceback", 1, KDB_REPEAT_NONE);
2757 kdb_register_repeat("btp", kdb_bt, "<pid>",
2758 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2759 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
2760 "Display stack all processes", 0, KDB_REPEAT_NONE);
2761 kdb_register_repeat("btc", kdb_bt, "",
2762 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2763 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2764 "Backtrace process given its struct task address", 0,
2765 KDB_REPEAT_NONE);
2766 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2767 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2768 kdb_register_repeat("env", kdb_env, "",
2769 "Show environment variables", 0, KDB_REPEAT_NONE);
2770 kdb_register_repeat("set", kdb_set, "",
2771 "Set environment variables", 0, KDB_REPEAT_NONE);
2772 kdb_register_repeat("help", kdb_help, "",
2773 "Display Help Message", 1, KDB_REPEAT_NONE);
2774 kdb_register_repeat("?", kdb_help, "",
2775 "Display Help Message", 0, KDB_REPEAT_NONE);
2776 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
2777 "Switch to new cpu", 0, KDB_REPEAT_NONE);
2778 kdb_register_repeat("kgdb", kdb_kgdb, "",
2779 "Enter kgdb mode", 0, KDB_REPEAT_NONE);
2780 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
2781 "Display active task list", 0, KDB_REPEAT_NONE);
2782 kdb_register_repeat("pid", kdb_pid, "<pidnum>",
2783 "Switch to another task", 0, KDB_REPEAT_NONE);
2784 kdb_register_repeat("reboot", kdb_reboot, "",
2785 "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
2786#if defined(CONFIG_MODULES)
2787 kdb_register_repeat("lsmod", kdb_lsmod, "",
2788 "List loaded kernel modules", 0, KDB_REPEAT_NONE);
2789#endif
2790#if defined(CONFIG_MAGIC_SYSRQ)
2791 kdb_register_repeat("sr", kdb_sr, "<key>",
2792 "Magic SysRq key", 0, KDB_REPEAT_NONE);
2793#endif
2794#if defined(CONFIG_PRINTK)
 2795	kdb_register_repeat("dmesg", kdb_dmesg, "[lines] [adjust]",
2796 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2797#endif
2798 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2799 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2800 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
2801 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2802 kdb_register_repeat("summary", kdb_summary, "",
2803 "Summarize the system", 4, KDB_REPEAT_NONE);
 2804	kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
2805 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2806 kdb_register_repeat("grephelp", kdb_grep_help, "",
2807 "Display help on | grep", 0, KDB_REPEAT_NONE);
2808}
2809
2810/* Execute any commands defined in kdb_cmds. */
2811static void __init kdb_cmd_init(void)
2812{
2813 int i, diag;
2814 for (i = 0; kdb_cmds[i]; ++i) {
2815 diag = kdb_parse(kdb_cmds[i]);
2816 if (diag)
2817 kdb_printf("kdb command %s failed, kdb diag %d\n",
2818 kdb_cmds[i], diag);
2819 }
2820 if (defcmd_in_progress) {
2821 kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
2822 kdb_parse("endefcmd");
2823 }
2824}
2825
 2826/* Initialize kdb_printf, breakpoint tables and kdb state */
2827void __init kdb_init(int lvl)
2828{
2829 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
2830 int i;
2831
2832 if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
2833 return;
2834 for (i = kdb_init_lvl; i < lvl; i++) {
2835 switch (i) {
2836 case KDB_NOT_INITIALIZED:
2837 kdb_inittab(); /* Initialize Command Table */
2838 kdb_initbptab(); /* Initialize Breakpoints */
2839 break;
2840 case KDB_INIT_EARLY:
2841 kdb_cmd_init(); /* Build kdb_cmds tables */
2842 break;
2843 }
2844 }
2845 kdb_init_lvl = lvl;
2846}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..97d3ba69775d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,300 @@
1#ifndef _KDBPRIVATE_H
2#define _KDBPRIVATE_H
3
4/*
5 * Kernel Debugger Architecture Independent Private Headers
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file "COPYING" in the main directory of this archive
9 * for more details.
10 *
11 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
12 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
13 */
14
15#include <linux/kgdb.h>
16#include "../debug_core.h"
17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002)
44#define KDB_CMD_SS (-1003)
45#define KDB_CMD_SSB (-1004)
46#define KDB_CMD_KGDB (-1005)
47#define KDB_CMD_KGDB2 (-1006)
48
49/* Internal debug flags */
50#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
51#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
52#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
53#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
54#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
55#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
56#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
57#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
58
59#define KDB_DEBUG(flag) (kdb_flags & \
60 (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
61#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
62 kdb_print_state(text, value)
63
64#if BITS_PER_LONG == 32
65
66#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
67
68#define kdb_machreg_fmt "0x%lx"
69#define kdb_machreg_fmt0 "0x%08lx"
70#define kdb_bfd_vma_fmt "0x%lx"
71#define kdb_bfd_vma_fmt0 "0x%08lx"
72#define kdb_elfw_addr_fmt "0x%x"
73#define kdb_elfw_addr_fmt0 "0x%08x"
74#define kdb_f_count_fmt "%d"
75
76#elif BITS_PER_LONG == 64
77
78#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
79
80#define kdb_machreg_fmt "0x%lx"
81#define kdb_machreg_fmt0 "0x%016lx"
82#define kdb_bfd_vma_fmt "0x%lx"
83#define kdb_bfd_vma_fmt0 "0x%016lx"
84#define kdb_elfw_addr_fmt "0x%x"
85#define kdb_elfw_addr_fmt0 "0x%016x"
86#define kdb_f_count_fmt "%ld"
87
88#endif
89
90/*
91 * KDB_MAXBPT describes the total number of breakpoints
 92 * supported by this architecture.
93 */
94#define KDB_MAXBPT 16
95
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */
110 const char *mod_name; /* Module containing symbol or
111 * "kernel" */
112 unsigned long mod_start;
113 unsigned long mod_end;
114 const char *sec_name; /* Section containing symbol */
115 unsigned long sec_start;
116 unsigned long sec_end;
117 const char *sym_name; /* Full symbol name, including
118 * any version */
119 unsigned long sym_start;
120 unsigned long sym_end;
121 } kdb_symtab_t;
122extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124
125/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t);
133
134/*
135 * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
136 * names, not pointers. The underlying *_size functions take pointers.
137 */
138#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
139#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
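
A short, hypothetical illustration of the distinction drawn just above: kdb_getarea() takes the destination variable itself, while kdb_getarea_size() takes a pointer plus an explicit length. The helper below is not part of this patch and assumes these declarations are in scope:

/* Illustrative only: read one word and a small raw block from a
 * previously validated kernel address 'addr'. */
static int demo_read(unsigned long addr)
{
	unsigned long word;
	char raw[16];
	int diag;

	diag = kdb_getarea(word, addr);		/* pass the variable, not &word */
	if (diag)
		return diag;
	diag = kdb_getarea_size(raw, addr, sizeof(raw));	/* pointer + size */
	if (diag)
		return diag;
	kdb_printf("first word at 0x%lx is 0x%lx\n", addr, word);
	return 0;
}
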
140
141extern int kdb_getphysword(unsigned long *word,
142 unsigned long addr, size_t size);
143extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t);
145
146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *);
153extern int kdbnearsym(unsigned long, kdb_symtab_t *);
154extern void kdbnearsym_cleanup(void);
155extern char *kdb_strdup(const char *str, gfp_t type);
156extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
157
158/* Routine for debugging the debugger state. */
159extern void kdb_print_state(const char *, int);
160
161extern int kdb_state;
162#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
163#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
164#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
165#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
166 * kdb control */
167#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
168#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
169#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
170 * DOING_SS is also set */
171#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
172 * after one ss, independent of
173 * DOING_SS */
174#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
175#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
176#define KDB_STATE_PAGER 0x00000400 /* pager is available */
177#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
178 * back to initial cpu */
179#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
180#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
181#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
182#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
183 * adjusted */
184#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
185#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
186 * keyboard on this cpu */
187#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
188#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
189#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
190#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
191#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
192 * specific use */
193
194#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
195#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
196#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
197
198extern int kdb_nextline; /* Current number of lines displayed */
199
200typedef struct _kdb_bp {
201 unsigned long bp_addr; /* Address breakpoint is present at */
202 unsigned int bp_free:1; /* This entry is available */
203 unsigned int bp_enabled:1; /* Breakpoint is active in register */
204 unsigned int bp_type:4; /* Uses hardware register */
205 unsigned int bp_installed:1; /* Breakpoint is installed */
206 unsigned int bp_delay:1; /* Do delayed bp handling */
207 unsigned int bp_delayed:1; /* Delayed breakpoint */
208 unsigned int bph_length; /* HW break length */
209} kdb_bp_t;
210
211#ifdef CONFIG_KGDB_KDB
212extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
213
214/* The KDB shell command table */
215typedef struct _kdbtab {
216 char *cmd_name; /* Command name */
217 kdb_func_t cmd_func; /* Function to execute command */
218 char *cmd_usage; /* Usage String for this command */
219 char *cmd_help; /* Help message for this command */
220 short cmd_flags; /* Parsing flags */
221 short cmd_minlen; /* Minimum legal # command
222 * chars required */
223 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
224} kdbtab_t;
225
226extern int kdb_bt(int, const char **); /* KDB display back trace */
227
228/* KDB breakpoint management functions */
229extern void kdb_initbptab(void);
230extern void kdb_bp_install(struct pt_regs *);
231extern void kdb_bp_remove(void);
232
233typedef enum {
234 KDB_DB_BPT, /* Breakpoint */
235 KDB_DB_SS, /* Single-step trap */
236 KDB_DB_SSB, /* Single step to branch */
237 KDB_DB_SSBPT, /* Single step over breakpoint */
238 KDB_DB_NOBPT /* Spurious breakpoint */
239} kdb_dbtrap_t;
240
241extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
242 int, kdb_dbtrap_t, struct pt_regs *);
243
244/* Miscellaneous functions and data areas */
245extern int kdb_grepping_flag;
246extern char kdb_grep_string[];
247extern int kdb_grep_leading;
248extern int kdb_grep_trailing;
249extern char *kdb_cmds[];
250extern void kdb_syslog_data(char *syslog_data[]);
251extern unsigned long kdb_task_state_string(const char *);
252extern char kdb_task_state_char (const struct task_struct *);
253extern unsigned long kdb_task_state(const struct task_struct *p,
254 unsigned long mask);
255extern void kdb_ps_suppressed(void);
256extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *);
262
263/* Defines for kdb_symbol_print */
264#define KDB_SP_SPACEB 0x0001 /* Space before string */
265#define KDB_SP_SPACEA 0x0002 /* Space after string */
266#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
267#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
268#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
269#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
270#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
271
272#define KDB_TSK(cpu) kgdb_info[cpu].task
273#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
274
275extern struct task_struct *kdb_curr_task(int);
276
277#define kdb_task_has_cpu(p) (task_curr(p))
278
279/* Simplify coexistence with NPTL */
280#define kdb_do_each_thread(g, p) do_each_thread(g, p)
281#define kdb_while_each_thread(g, p) while_each_thread(g, p)
282
283#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
284
285extern void *debug_kmalloc(size_t size, gfp_t flags);
286extern void debug_kfree(void *);
287extern void debug_kusage(void);
288
289extern void kdb_set_current_task(struct task_struct *);
290extern struct task_struct *kdb_current_task;
291#ifdef CONFIG_MODULES
292extern struct list_head *kdb_modules;
293#endif /* CONFIG_MODULES */
294
295extern char kdb_prompt_str[];
296
297#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
298
299#endif /* CONFIG_KGDB_KDB */
300#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..45344d5c53dd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
1/*
2 * Kernel Debugger Architecture Independent Support Functions
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
11 */
12
13#include <stdarg.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kallsyms.h>
18#include <linux/stddef.h>
19#include <linux/vmalloc.h>
20#include <linux/ptrace.h>
21#include <linux/module.h>
22#include <linux/highmem.h>
23#include <linux/hardirq.h>
24#include <linux/delay.h>
25#include <linux/uaccess.h>
26#include <linux/kdb.h>
27#include <linux/slab.h>
28#include "kdb_private.h"
29
30/*
31 * kdbgetsymval - Return the address of the given symbol.
32 *
33 * Parameters:
34 * symname Character string containing symbol name
35 * symtab Structure to receive results
36 * Returns:
37 * 0 Symbol not found, symtab zero filled
38 * 1 Symbol mapped to module/symbol/section, data in symtab
39 */
40int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
41{
42 if (KDB_DEBUG(AR))
43 kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
44 symtab);
45 memset(symtab, 0, sizeof(*symtab));
46 symtab->sym_start = kallsyms_lookup_name(symname);
47 if (symtab->sym_start) {
48 if (KDB_DEBUG(AR))
49 kdb_printf("kdbgetsymval: returns 1, "
50 "symtab->sym_start=0x%lx\n",
51 symtab->sym_start);
52 return 1;
53 }
54 if (KDB_DEBUG(AR))
55 kdb_printf("kdbgetsymval: returns 0\n");
56 return 0;
57}
58EXPORT_SYMBOL(kdbgetsymval);
59
60static char *kdb_name_table[100]; /* arbitrary size */
61
62/*
63 * kdbnearsym - Return the name of the symbol with the nearest address
64 * less than 'addr'.
65 *
66 * Parameters:
67 * addr Address to check for symbol near
68 * symtab Structure to receive results
69 * Returns:
70 * 0 No sections contain this address, symtab zero filled
71 * 1 Address mapped to module/symbol/section, data in symtab
72 * Remarks:
73 * 2.6 kallsyms has a "feature" where it unpacks the name into a
74 * string. If that string is reused before the caller expects it
75 * then the caller sees its string change without warning. To
76 * avoid cluttering up the main kdb code with lots of kdb_strdup,
77 * tests and kfree calls, kdbnearsym maintains an LRU list of the
78 * last few unique strings. The list is sized large enough to
 79 * hold all active strings; no kdb caller of kdbnearsym makes more
 80 * than ~20 subsequent calls before using a saved value.
81 */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{
84 int ret = 0;
85 unsigned long symbolsize;
86 unsigned long offset;
87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL;
89
90 if (KDB_DEBUG(AR))
91 kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
92 memset(symtab, 0, sizeof(*symtab));
93
94 if (addr < 4096)
95 goto out;
96 knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
97 if (!knt1) {
98 kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
99 addr);
100 goto out;
101 }
 102	symtab->sym_name = kallsyms_lookup(addr, &symbolsize, &offset,
103 (char **)(&symtab->mod_name), knt1);
104 if (offset > 8*1024*1024) {
105 symtab->sym_name = NULL;
106 addr = offset = symbolsize = 0;
107 }
108 symtab->sym_start = addr - offset;
109 symtab->sym_end = symtab->sym_start + symbolsize;
110 ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
111
112 if (ret) {
113 int i;
114 /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
115 * set but the buffer passed into kallsyms_lookup is not used,
116 * so it contains garbage. The caller has to work out which
117 * buffer needs to be saved.
118 *
119 * What was Rusty smoking when he wrote that code?
120 */
121 if (symtab->sym_name != knt1) {
122 strncpy(knt1, symtab->sym_name, knt1_size);
123 knt1[knt1_size-1] = '\0';
124 }
125 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
126 if (kdb_name_table[i] &&
127 strcmp(kdb_name_table[i], knt1) == 0)
128 break;
129 }
130 if (i >= ARRAY_SIZE(kdb_name_table)) {
131 debug_kfree(kdb_name_table[0]);
132 memcpy(kdb_name_table, kdb_name_table+1,
133 sizeof(kdb_name_table[0]) *
134 (ARRAY_SIZE(kdb_name_table)-1));
135 } else {
136 debug_kfree(knt1);
137 knt1 = kdb_name_table[i];
138 memcpy(kdb_name_table+i, kdb_name_table+i+1,
139 sizeof(kdb_name_table[0]) *
140 (ARRAY_SIZE(kdb_name_table)-i-1));
141 }
142 i = ARRAY_SIZE(kdb_name_table) - 1;
143 kdb_name_table[i] = knt1;
144 symtab->sym_name = kdb_name_table[i];
145 knt1 = NULL;
146 }
147
148 if (symtab->mod_name == NULL)
149 symtab->mod_name = "kernel";
150 if (KDB_DEBUG(AR))
151 kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
152 "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
153 symtab->sym_start, symtab->mod_name, symtab->sym_name,
154 symtab->sym_name);
155
156out:
157 debug_kfree(knt1);
158 return ret;
159}
160
161void kdbnearsym_cleanup(void)
162{
163 int i;
164 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
165 if (kdb_name_table[i]) {
166 debug_kfree(kdb_name_table[i]);
167 kdb_name_table[i] = NULL;
168 }
169 }
170}
171
172static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
173
174/*
175 * kallsyms_symbol_complete
176 *
177 * Parameters:
178 * prefix_name prefix of a symbol name to lookup
179 * max_len maximum length that can be returned
180 * Returns:
181 * Number of symbols which match the given prefix.
182 * Notes:
183 * prefix_name is changed to contain the longest unique prefix that
184 * starts with this prefix (tab completion).
185 */
186int kallsyms_symbol_complete(char *prefix_name, int max_len)
187{
188 loff_t pos = 0;
189 int prefix_len = strlen(prefix_name), prev_len = 0;
190 int i, number = 0;
191 const char *name;
192
193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name);
196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1,
199 strlen(ks_namebuf));
200 memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
201 ks_namebuf_prev[prev_len] = '\0';
202 continue;
203 }
204 for (i = 0; i < prev_len; i++) {
205 if (ks_namebuf[i] != ks_namebuf_prev[i]) {
206 prev_len = i;
207 ks_namebuf_prev[i] = '\0';
208 break;
209 }
210 }
211 }
212 }
213 if (prev_len > prefix_len)
214 memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
215 return number;
216}
217
218/*
219 * kallsyms_symbol_next
220 *
221 * Parameters:
222 * prefix_name prefix of a symbol name to lookup
223 * flag 0 means search from the head, 1 means continue search.
224 * Returns:
225 * 1 if a symbol matches the given prefix.
226 * 0 if no string found
227 */
228int kallsyms_symbol_next(char *prefix_name, int flag)
229{
230 int prefix_len = strlen(prefix_name);
231 static loff_t pos;
232 const char *name;
233
234 if (!flag)
235 pos = 0;
236
237 while ((name = kdb_walk_kallsyms(&pos))) {
238 if (strncmp(name, prefix_name, prefix_len) == 0) {
239 strncpy(prefix_name, name, strlen(name)+1);
240 return 1;
241 }
242 }
243 return 0;
244}
245
246/*
247 * kdb_symbol_print - Standard method for printing a symbol name and offset.
248 * Inputs:
249 * addr Address to be printed.
250 * symtab Address of symbol data, if NULL this routine does its
251 * own lookup.
252 * punc Punctuation for string, bit field.
253 * Remarks:
254 * The string and its punctuation is only printed if the address
255 * is inside the kernel, except that the value is always printed
256 * when requested.
257 */
258void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
259 unsigned int punc)
260{
261 kdb_symtab_t symtab, *symtab_p2;
262 if (symtab_p) {
263 symtab_p2 = (kdb_symtab_t *)symtab_p;
264 } else {
265 symtab_p2 = &symtab;
266 kdbnearsym(addr, symtab_p2);
267 }
268 if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
269 return;
270 if (punc & KDB_SP_SPACEB)
271 kdb_printf(" ");
272 if (punc & KDB_SP_VALUE)
273 kdb_printf(kdb_machreg_fmt0, addr);
274 if (symtab_p2->sym_name) {
275 if (punc & KDB_SP_VALUE)
276 kdb_printf(" ");
277 if (punc & KDB_SP_PAREN)
278 kdb_printf("(");
279 if (strcmp(symtab_p2->mod_name, "kernel"))
280 kdb_printf("[%s]", symtab_p2->mod_name);
281 kdb_printf("%s", symtab_p2->sym_name);
282 if (addr != symtab_p2->sym_start)
283 kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
284 if (punc & KDB_SP_SYMSIZE)
285 kdb_printf("/0x%lx",
286 symtab_p2->sym_end - symtab_p2->sym_start);
287 if (punc & KDB_SP_PAREN)
288 kdb_printf(")");
289 }
290 if (punc & KDB_SP_SPACEA)
291 kdb_printf(" ");
292 if (punc & KDB_SP_NEWLINE)
293 kdb_printf("\n");
294}
295
296/*
297 * kdb_strdup - kdb equivalent of strdup, for disasm code.
298 * Inputs:
299 * str The string to duplicate.
300 * type Flags to kmalloc for the new string.
301 * Returns:
302 * Address of the new string, NULL if storage could not be allocated.
303 * Remarks:
304 * This is not in lib/string.c because it uses kmalloc which is not
305 * available when string.o is used in boot loaders.
306 */
307char *kdb_strdup(const char *str, gfp_t type)
308{
309 int n = strlen(str)+1;
310 char *s = kmalloc(n, type);
311 if (!s)
312 return NULL;
313 return strcpy(s, str);
314}
315
316/*
317 * kdb_getarea_size - Read an area of data. The kdb equivalent of
318 * copy_from_user, with kdb messages for invalid addresses.
319 * Inputs:
320 * res Pointer to the area to receive the result.
321 * addr Address of the area to copy.
322 * size Size of the area.
323 * Returns:
324 * 0 for success, < 0 for error.
325 */
326int kdb_getarea_size(void *res, unsigned long addr, size_t size)
327{
328 int ret = probe_kernel_read((char *)res, (char *)addr, size);
329 if (ret) {
330 if (!KDB_STATE(SUPPRESS)) {
331 kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
332 KDB_STATE_SET(SUPPRESS);
333 }
334 ret = KDB_BADADDR;
335 } else {
336 KDB_STATE_CLEAR(SUPPRESS);
337 }
338 return ret;
339}
340
341/*
342 * kdb_putarea_size - Write an area of data. The kdb equivalent of
343 * copy_to_user, with kdb messages for invalid addresses.
344 * Inputs:
345 * addr Address of the area to write to.
346 * res Pointer to the area holding the data.
347 * size Size of the area.
348 * Returns:
349 * 0 for success, < 0 for error.
350 */
351int kdb_putarea_size(unsigned long addr, void *res, size_t size)
352{
353 int ret = probe_kernel_read((char *)addr, (char *)res, size);
354 if (ret) {
355 if (!KDB_STATE(SUPPRESS)) {
356 kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
357 KDB_STATE_SET(SUPPRESS);
358 }
359 ret = KDB_BADADDR;
360 } else {
361 KDB_STATE_CLEAR(SUPPRESS);
362 }
363 return ret;
364}
365
366/*
367 * kdb_getphys - Read data from a physical address. Validate the
368 * address is in range, use kmap_atomic() to get data
369 * similar to kdb_getarea() - but for phys addresses
370 * Inputs:
371 * res Pointer to the word to receive the result
372 * addr Physical address of the area to copy
373 * size Size of the area
374 * Returns:
375 * 0 for success, < 0 for error.
376 */
377static int kdb_getphys(void *res, unsigned long addr, size_t size)
378{
379 unsigned long pfn;
380 void *vaddr;
381 struct page *page;
382
383 pfn = (addr >> PAGE_SHIFT);
384 if (!pfn_valid(pfn))
385 return 1;
386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB);
390
391 return 0;
392}
393
394/*
395 * kdb_getphysword
396 * Inputs:
397 * word Pointer to the word to receive the result.
398 * addr Address of the area to copy.
399 * size Size of the area.
400 * Returns:
401 * 0 for success, < 0 for error.
402 */
403int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
404{
405 int diag;
406 __u8 w1;
407 __u16 w2;
408 __u32 w4;
409 __u64 w8;
410 *word = 0; /* Default value if addr or size is invalid */
411
412 switch (size) {
413 case 1:
414 diag = kdb_getphys(&w1, addr, sizeof(w1));
415 if (!diag)
416 *word = w1;
417 break;
418 case 2:
419 diag = kdb_getphys(&w2, addr, sizeof(w2));
420 if (!diag)
421 *word = w2;
422 break;
423 case 4:
424 diag = kdb_getphys(&w4, addr, sizeof(w4));
425 if (!diag)
426 *word = w4;
427 break;
428 case 8:
429 if (size <= sizeof(*word)) {
430 diag = kdb_getphys(&w8, addr, sizeof(w8));
431 if (!diag)
432 *word = w8;
433 break;
434 }
435 /* drop through */
436 default:
437 diag = KDB_BADWIDTH;
438 kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
439 }
440 return diag;
441}
442
443/*
444 * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
445 * data as numbers.
446 * Inputs:
447 * word Pointer to the word to receive the result.
448 * addr Address of the area to copy.
449 * size Size of the area.
450 * Returns:
451 * 0 for success, < 0 for error.
452 */
453int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
454{
455 int diag;
456 __u8 w1;
457 __u16 w2;
458 __u32 w4;
459 __u64 w8;
460 *word = 0; /* Default value if addr or size is invalid */
461 switch (size) {
462 case 1:
463 diag = kdb_getarea(w1, addr);
464 if (!diag)
465 *word = w1;
466 break;
467 case 2:
468 diag = kdb_getarea(w2, addr);
469 if (!diag)
470 *word = w2;
471 break;
472 case 4:
473 diag = kdb_getarea(w4, addr);
474 if (!diag)
475 *word = w4;
476 break;
477 case 8:
478 if (size <= sizeof(*word)) {
479 diag = kdb_getarea(w8, addr);
480 if (!diag)
481 *word = w8;
482 break;
483 }
484 /* drop through */
485 default:
486 diag = KDB_BADWIDTH;
487 kdb_printf("kdb_getword: bad width %ld\n", (long) size);
488 }
489 return diag;
490}
491
492/*
493 * kdb_putword - Write a binary value. Unlike kdb_putarea, this
494 * treats data as numbers.
495 * Inputs:
 496 *	addr	Address of the area to write to.
497 * word The value to set.
498 * size Size of the area.
499 * Returns:
500 * 0 for success, < 0 for error.
501 */
502int kdb_putword(unsigned long addr, unsigned long word, size_t size)
503{
504 int diag;
505 __u8 w1;
506 __u16 w2;
507 __u32 w4;
508 __u64 w8;
509 switch (size) {
510 case 1:
511 w1 = word;
512 diag = kdb_putarea(addr, w1);
513 break;
514 case 2:
515 w2 = word;
516 diag = kdb_putarea(addr, w2);
517 break;
518 case 4:
519 w4 = word;
520 diag = kdb_putarea(addr, w4);
521 break;
522 case 8:
523 if (size <= sizeof(word)) {
524 w8 = word;
525 diag = kdb_putarea(addr, w8);
526 break;
527 }
528 /* drop through */
529 default:
530 diag = KDB_BADWIDTH;
531 kdb_printf("kdb_putword: bad width %ld\n", (long) size);
532 }
533 return diag;
534}
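
Together, kdb_getword() and kdb_putword() give the width-checked read-modify-write that a command like 'mm' builds on. A hypothetical helper, not part of this patch, assuming the kdb declarations are in scope:

/* Illustrative only: OR a flag bit into the word at 'addr' without
 * dereferencing raw pointers. */
static int demo_set_bit(unsigned long addr, unsigned long bit)
{
	unsigned long word;
	int diag;

	diag = kdb_getword(&word, addr, sizeof(unsigned long));
	if (diag)
		return diag;		/* bad address or width */
	word |= bit;
	return kdb_putword(addr, word, sizeof(unsigned long));
}
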
535
536/*
537 * kdb_task_state_string - Convert a string containing any of the
538 * letters DRSTCZEUIMA to a mask for the process state field and
539 * return the value. If no argument is supplied, return the mask
540 * that corresponds to environment variable PS, DRSTCZEU by
541 * default.
542 * Inputs:
543 * s String to convert
544 * Returns:
545 * Mask for process state.
546 * Notes:
547 * The mask folds data from several sources into a single long value, so
 548 *	be careful not to overlap the bits.  TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
552 * the mask.
553 */
554
555/* unrunnable is < 0 */
556#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
557#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
558#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
559#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
560
561unsigned long kdb_task_state_string(const char *s)
562{
563 long res = 0;
564 if (!s) {
565 s = kdbgetenv("PS");
566 if (!s)
567 s = "DRSTCZEU"; /* default value for ps */
568 }
569 while (*s) {
570 switch (*s) {
571 case 'D':
572 res |= TASK_UNINTERRUPTIBLE;
573 break;
574 case 'R':
575 res |= RUNNING;
576 break;
577 case 'S':
578 res |= TASK_INTERRUPTIBLE;
579 break;
580 case 'T':
581 res |= TASK_STOPPED;
582 break;
583 case 'C':
584 res |= TASK_TRACED;
585 break;
586 case 'Z':
587 res |= EXIT_ZOMBIE << 16;
588 break;
589 case 'E':
590 res |= EXIT_DEAD << 16;
591 break;
592 case 'U':
593 res |= UNRUNNABLE;
594 break;
595 case 'I':
596 res |= IDLE;
597 break;
598 case 'M':
599 res |= DAEMON;
600 break;
601 case 'A':
602 res = ~0UL;
603 break;
604 default:
605 kdb_printf("%s: unknown flag '%c' ignored\n",
606 __func__, *s);
607 break;
608 }
609 ++s;
610 }
611 return res;
612}
613
614/*
615 * kdb_task_state_char - Return the character that represents the task state.
616 * Inputs:
617 * p struct task for the process
618 * Returns:
619 * One character to represent the task state.
620 */
621char kdb_task_state_char (const struct task_struct *p)
622{
623 int cpu;
624 char state;
625 unsigned long tmp;
626
627 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
628 return 'E';
629
630 cpu = kdb_process_cpu(p);
631 state = (p->state == 0) ? 'R' :
632 (p->state < 0) ? 'U' :
633 (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
634 (p->state & TASK_STOPPED) ? 'T' :
635 (p->state & TASK_TRACED) ? 'C' :
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
643 if (cpu != kdb_initial_cpu)
644 state = 'I'; /* idle task */
645 }
646 } else if (!p->mm && state == 'S') {
647 state = 'M'; /* sleeping system daemon */
648 }
649 return state;
650}
651
652/*
653 * kdb_task_state - Return true if a process has the desired state
654 * given by the mask.
655 * Inputs:
656 * p struct task for the process
657 * mask mask from kdb_task_state_string to select processes
658 * Returns:
659 * True if the process matches at least one criteria defined by the mask.
660 */
661unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
662{
663 char state[] = { kdb_task_state_char(p), '\0' };
664 return (mask & kdb_task_state_string(state)) != 0;
665}
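
Taken together, kdb_task_state_string() and kdb_task_state() let a ps-style command filter the task list by state letter. A brief sketch of the usual pattern, assuming the kdb_do_each_thread()/kdb_while_each_thread() iteration macros from kdb_private.h; the "DRS" filter is illustrative:

	unsigned long mask = kdb_task_state_string("DRS");
	struct task_struct *g, *p;

	kdb_do_each_thread(g, p) {
		if (!kdb_task_state(p, mask))
			continue;
		kdb_printf("%d %c %s\n", p->pid,
			   kdb_task_state_char(p), p->comm);
	} kdb_while_each_thread(g, p);
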
666
667/*
668 * kdb_print_nameval - Print a name and its value, converting the
669 * value to a symbol lookup if possible.
670 * Inputs:
671 * name field name to print
672 * val value of field
673 */
674void kdb_print_nameval(const char *name, unsigned long val)
675{
676 kdb_symtab_t symtab;
677 kdb_printf(" %-11.11s ", name);
678 if (kdbnearsym(val, &symtab))
679 kdb_symbol_print(val, &symtab,
680 KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
681 else
682 kdb_printf("0x%lx\n", val);
683}
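
For example, a structure dumper can report each field either as a symbol or as raw hex with one call per field; the field names and values below are illustrative:

	kdb_print_nameval("threadinfo", (unsigned long)task_thread_info(p));
	kdb_print_nameval("flags", p->flags);
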
684
685/* Last ditch allocator for debugging, so we can still debug even when
686 * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
687 * for space usage, not for speed.  There is one smallish memory pool; the
688 * free chain is kept in ascending address order to allow coalescing, and
689 * allocations are done by brute-force best fit.
690 */
691
692struct debug_alloc_header {
693 u32 next; /* offset of next header from start of pool */
694 u32 size;
695 void *caller;
696};
697
698/* The memory returned by this allocator must be aligned, which means
699 * so must the header size. Do not assume that sizeof(struct
700 * debug_alloc_header) is a multiple of the alignment; explicitly
701 * calculate the overhead of this header, including the alignment.
702 * The rest of this code must not use sizeof() on any header or
703 * pointer to a header.
704 */
705#define dah_align 8
706#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
707
708static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
709static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
710static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
711
712/* Locking is awkward. The debug code is called from all contexts,
713 * including non maskable interrupts. A normal spinlock is not safe
714 * in NMI context.  Try to get the debug allocator lock; if it cannot
715 * be obtained within a second, give up.  If the lock could not be
716 * obtained previously on this cpu, only try once.
717 *
718 * sparse has no annotation for "this function _sometimes_ acquires a
719 * lock", so fudge the acquire/release notation.
720 */
721static DEFINE_SPINLOCK(dap_lock);
722static int get_dap_lock(void)
723 __acquires(dap_lock)
724{
725 static int dap_locked = -1;
726 int count;
727 if (dap_locked == smp_processor_id())
728 count = 1;
729 else
730 count = 1000;
731 while (1) {
732 if (spin_trylock(&dap_lock)) {
733 dap_locked = -1;
734 return 1;
735 }
736 if (!count--)
737 break;
738 udelay(1000);
739 }
740 dap_locked = smp_processor_id();
741 __acquire(dap_lock);
742 return 0;
743}
744
745void *debug_kmalloc(size_t size, gfp_t flags)
746{
747 unsigned int rem, h_offset;
748 struct debug_alloc_header *best, *bestprev, *prev, *h;
749 void *p = NULL;
750 if (!get_dap_lock()) {
751 __release(dap_lock); /* we never actually got it */
752 return NULL;
753 }
754 h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
755 if (dah_first_call) {
756 h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
757 dah_first_call = 0;
758 }
759 size = ALIGN(size, dah_align);
760 prev = best = bestprev = NULL;
761 while (1) {
762 if (h->size >= size && (!best || h->size < best->size)) {
763 best = h;
764 bestprev = prev;
765 if (h->size == size)
766 break;
767 }
768 if (!h->next)
769 break;
770 prev = h;
771 h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
772 }
773 if (!best)
774 goto out;
775 rem = best->size - size;
776 /* The pool must always contain at least one header */
777 if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
778 goto out;
779 if (rem >= dah_overhead) {
780 best->size = size;
781 h_offset = ((char *)best - debug_alloc_pool) +
782 dah_overhead + best->size;
783 h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
784 h->size = rem - dah_overhead;
785 h->next = best->next;
786 } else
787 h_offset = best->next;
788 best->caller = __builtin_return_address(0);
789 dah_used += best->size;
790 dah_used_max = max(dah_used, dah_used_max);
791 if (bestprev)
792 bestprev->next = h_offset;
793 else
794 dah_first = h_offset;
795 p = (char *)best + dah_overhead;
796 memset(p, POISON_INUSE, best->size - 1);
797 *((char *)p + best->size - 1) = POISON_END;
798out:
799 spin_unlock(&dap_lock);
800 return p;
801}
802
803void debug_kfree(void *p)
804{
805 struct debug_alloc_header *h;
806 unsigned int h_offset;
807 if (!p)
808 return;
809 if ((char *)p < debug_alloc_pool ||
810 (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
811 kfree(p);
812 return;
813 }
814 if (!get_dap_lock()) {
815 __release(dap_lock); /* we never actually got it */
816 return; /* memory leak, cannot be helped */
817 }
818 h = (struct debug_alloc_header *)((char *)p - dah_overhead);
819 memset(p, POISON_FREE, h->size - 1);
820 *((char *)p + h->size - 1) = POISON_END;
821 h->caller = NULL;
822 dah_used -= h->size;
823 h_offset = (char *)h - debug_alloc_pool;
824 if (h_offset < dah_first) {
825 h->next = dah_first;
826 dah_first = h_offset;
827 } else {
828 struct debug_alloc_header *prev;
829 unsigned int prev_offset;
830 prev = (struct debug_alloc_header *)(debug_alloc_pool +
831 dah_first);
832 while (1) {
833 if (!prev->next || prev->next > h_offset)
834 break;
835 prev = (struct debug_alloc_header *)
836 (debug_alloc_pool + prev->next);
837 }
838 prev_offset = (char *)prev - debug_alloc_pool;
839 if (prev_offset + dah_overhead + prev->size == h_offset) {
840 prev->size += dah_overhead + h->size;
841 memset(h, POISON_FREE, dah_overhead - 1);
842 *((char *)h + dah_overhead - 1) = POISON_END;
843 h = prev;
844 h_offset = prev_offset;
845 } else {
846 h->next = prev->next;
847 prev->next = h_offset;
848 }
849 }
850 if (h_offset + dah_overhead + h->size == h->next) {
851 struct debug_alloc_header *next;
852 next = (struct debug_alloc_header *)
853 (debug_alloc_pool + h->next);
854 h->size += dah_overhead + next->size;
855 h->next = next->next;
856 memset(next, POISON_FREE, dah_overhead - 1);
857 *((char *)next + dah_overhead - 1) = POISON_END;
858 }
859 spin_unlock(&dap_lock);
860}
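
debug_kmalloc()/debug_kfree() are used like kmalloc()/kfree(), with the caller prepared for NULL when the 256K pool is exhausted or dap_lock cannot be taken from this context. A minimal sketch; the structure type is illustrative:

	struct frame_record *fr;	/* illustrative type */

	fr = debug_kmalloc(sizeof(*fr), GFP_ATOMIC);
	if (!fr)
		return;			/* pool exhausted or lock unavailable */
	/* ... use fr while still inside the debugger ... */
	debug_kfree(fr);		/* pointers outside the pool fall back to kfree() */
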
861
862void debug_kusage(void)
863{
864 struct debug_alloc_header *h_free, *h_used;
865#ifdef CONFIG_IA64
866 /* FIXME: using dah for ia64 unwind always results in a memory leak.
867 * Fix that memory leak first, then set debug_kusage_one_time = 1 for
868 * all architectures.
869 */
870 static int debug_kusage_one_time;
871#else
872 static int debug_kusage_one_time = 1;
873#endif
874 if (!get_dap_lock()) {
875 __release(dap_lock); /* we never actually got it */
876 return;
877 }
878 h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
879 if (dah_first == 0 &&
880 (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
881 dah_first_call))
882 goto out;
883 if (!debug_kusage_one_time)
884 goto out;
885 debug_kusage_one_time = 0;
886 kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
887 __func__, dah_first);
888 if (dah_first) {
889 h_used = (struct debug_alloc_header *)debug_alloc_pool;
890 kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
891 h_used->size);
892 }
893 do {
894 h_used = (struct debug_alloc_header *)
895 ((char *)h_free + dah_overhead + h_free->size);
896 kdb_printf("%s: h_used %p size %d caller %p\n",
897 __func__, h_used, h_used->size, h_used->caller);
898 h_free = (struct debug_alloc_header *)
899 (debug_alloc_pool + h_free->next);
900 } while (h_free->next);
901 h_used = (struct debug_alloc_header *)
902 ((char *)h_free + dah_overhead + h_free->size);
903 if ((char *)h_used - debug_alloc_pool !=
904 sizeof(debug_alloc_pool_aligned))
905 kdb_printf("%s: h_used %p size %d caller %p\n",
906 __func__, h_used, h_used->size, h_used->caller);
907out:
908 spin_unlock(&dap_lock);
909}
910
911/* Maintain a small stack of kdb_flags to allow recursion without disturbing
912 * the global kdb state.
913 */
914
915static int kdb_flags_stack[4], kdb_flags_index;
916
917void kdb_save_flags(void)
918{
919 BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
920 kdb_flags_stack[kdb_flags_index++] = kdb_flags;
921}
922
923void kdb_restore_flags(void)
924{
925 BUG_ON(kdb_flags_index <= 0);
926 kdb_flags = kdb_flags_stack[--kdb_flags_index];
927}
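
The intended usage is a balanced push/pop around any path that may re-enter kdb, so the nested entry can change kdb_flags freely; a sketch, with the specific flag manipulation shown only as an illustration:

	kdb_save_flags();
	KDB_FLAG_SET(CMD_INTERRUPT);	/* illustrative: adjust state for the nested entry */
	/* ... run the recursive kdb command ... */
	kdb_restore_flags();		/* the outer invocation sees its original flags */
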
diff --git a/kernel/early_res.c b/kernel/early_res.c
new file mode 100644
index 000000000000..7bfae887f211
--- /dev/null
+++ b/kernel/early_res.c
@@ -0,0 +1,590 @@
1/*
2 * early_res, could be used to replace bootmem
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/bootmem.h>
8#include <linux/mm.h>
9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
12
13/*
14 * Early reserved memory areas.
15 */
16/*
17 * need to make sure this one is big enough before
18 * find_fw_memmap_area() can be used
19 */
20#define MAX_EARLY_RES_X 32
21
22struct early_res {
23 u64 start, end;
24 char name[15];
25 char overlap_ok;
26};
27static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata;
28
29static int max_early_res __initdata = MAX_EARLY_RES_X;
30static struct early_res *early_res __initdata = &early_res_x[0];
31static int early_res_count __initdata;
32
33static int __init find_overlapped_early(u64 start, u64 end)
34{
35 int i;
36 struct early_res *r;
37
38 for (i = 0; i < max_early_res && early_res[i].end; i++) {
39 r = &early_res[i];
40 if (end > r->start && start < r->end)
41 break;
42 }
43
44 return i;
45}
46
47/*
48 * Drop the i-th range from the early reservation map,
49 * by copying any higher ranges down one over it, and
50 * clearing what had been the last slot.
51 */
52static void __init drop_range(int i)
53{
54 int j;
55
56 for (j = i + 1; j < max_early_res && early_res[j].end; j++)
57 ;
58
59 memmove(&early_res[i], &early_res[i + 1],
60 (j - 1 - i) * sizeof(struct early_res));
61
62 early_res[j - 1].end = 0;
63 early_res_count--;
64}
65
66static void __init drop_range_partial(int i, u64 start, u64 end)
67{
68 u64 common_start, common_end;
69 u64 old_start, old_end;
70
71 old_start = early_res[i].start;
72 old_end = early_res[i].end;
73 common_start = max(old_start, start);
74 common_end = min(old_end, end);
75
76 /* no overlap ? */
77 if (common_start >= common_end)
78 return;
79
80 if (old_start < common_start) {
81 /* make head segment */
82 early_res[i].end = common_start;
83 if (old_end > common_end) {
84 char name[15];
85
86 /*
87 * Save a local copy of the name, since the
88 * early_res array could get resized inside
89 * reserve_early_without_check() ->
90 * __check_and_double_early_res(), which would
91 * make the current name pointer invalid.
92 */
93 strncpy(name, early_res[i].name,
94 sizeof(early_res[i].name) - 1);
95 /* add another for left over on tail */
96 reserve_early_without_check(common_end, old_end, name);
97 }
98 return;
99 } else {
100 if (old_end > common_end) {
101 /* reuse the entry for tail left */
102 early_res[i].start = common_end;
103 return;
104 }
105 /* all covered */
106 drop_range(i);
107 }
108}
109
110/*
111 * Split any existing ranges that:
112 * 1) are marked 'overlap_ok', and
113 * 2) overlap with the stated range [start, end)
114 * into whatever portion (if any) of the existing range is entirely
115 * below or entirely above the stated range. Drop the portion
116 * of the existing range that overlaps with the stated range,
117 * which will allow the caller of this routine to then add that
118 * stated range without conflicting with any existing range.
119 */
120static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
121{
122 int i;
123 struct early_res *r;
124 u64 lower_start, lower_end;
125 u64 upper_start, upper_end;
126 char name[15];
127
128 for (i = 0; i < max_early_res && early_res[i].end; i++) {
129 r = &early_res[i];
130
131 /* Continue past non-overlapping ranges */
132 if (end <= r->start || start >= r->end)
133 continue;
134
135 /*
136 * Leave non-ok overlaps as is; let caller
137 * panic "Overlapping early reservations"
138 * when it hits this overlap.
139 */
140 if (!r->overlap_ok)
141 return;
142
143 /*
144 * We have an ok overlap. We will drop it from the early
145 * reservation map, and add back in any non-overlapping
146 * portions (lower or upper) as separate, overlap_ok,
147 * non-overlapping ranges.
148 */
149
150 /* 1. Note any non-overlapping (lower or upper) ranges. */
151 strncpy(name, r->name, sizeof(name) - 1);
152
153 lower_start = lower_end = 0;
154 upper_start = upper_end = 0;
155 if (r->start < start) {
156 lower_start = r->start;
157 lower_end = start;
158 }
159 if (r->end > end) {
160 upper_start = end;
161 upper_end = r->end;
162 }
163
164 /* 2. Drop the original ok overlapping range */
165 drop_range(i);
166
167 i--; /* resume for-loop on copied down entry */
168
169 /* 3. Add back in any non-overlapping ranges. */
170 if (lower_end)
171 reserve_early_overlap_ok(lower_start, lower_end, name);
172 if (upper_end)
173 reserve_early_overlap_ok(upper_start, upper_end, name);
174 }
175}
176
177static void __init __reserve_early(u64 start, u64 end, char *name,
178 int overlap_ok)
179{
180 int i;
181 struct early_res *r;
182
183 i = find_overlapped_early(start, end);
184 if (i >= max_early_res)
185 panic("Too many early reservations");
186 r = &early_res[i];
187 if (r->end)
188 panic("Overlapping early reservations "
189 "%llx-%llx %s to %llx-%llx %s\n",
190 start, end - 1, name ? name : "", r->start,
191 r->end - 1, r->name);
192 r->start = start;
193 r->end = end;
194 r->overlap_ok = overlap_ok;
195 if (name)
196 strncpy(r->name, name, sizeof(r->name) - 1);
197 early_res_count++;
198}
199
200/*
201 * A few early reservations come here.
202 *
203 * The 'overlap_ok' in the name of this routine does -not- mean it
204 * is ok for these reservations to overlap an earlier reservation.
205 * Rather it means that it is ok for subsequent reservations to
206 * overlap this one.
207 *
208 * Use this entry point to reserve early ranges when you are doing
209 * so out of "Paranoia", reserving perhaps more memory than you need,
210 * just in case, and don't mind a subsequent overlapping reservation
211 * that is known to be needed.
212 *
213 * The drop_overlaps_that_are_ok() call here isn't really needed.
214 * It would be needed if we had two colliding 'overlap_ok'
215 * reservations, so that the second such would not panic on the
216 * overlap with the first. We don't have any such as of this
217 * writing, but might as well tolerate such if it happens in
218 * the future.
219 */
220void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
221{
222 drop_overlaps_that_are_ok(start, end);
223 __reserve_early(start, end, name, 1);
224}
225
226static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end)
227{
228 u64 start, end, size, mem;
229 struct early_res *new;
230
231 /* do we have enough slots left ? */
232 if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
233 return;
234
235 /* double it */
236 mem = -1ULL;
237 size = sizeof(struct early_res) * max_early_res * 2;
238 if (early_res == early_res_x)
239 start = 0;
240 else
241 start = early_res[0].end;
242 end = ex_start;
243 if (start + size < end)
244 mem = find_fw_memmap_area(start, end, size,
245 sizeof(struct early_res));
246 if (mem == -1ULL) {
247 start = ex_end;
248 end = get_max_mapped();
249 if (start + size < end)
250 mem = find_fw_memmap_area(start, end, size,
251 sizeof(struct early_res));
252 }
253 if (mem == -1ULL)
254 panic("can not find more space for early_res array");
255
256 new = __va(mem);
257 /* save the first one for own */
258 new[0].start = mem;
259 new[0].end = mem + size;
260 new[0].overlap_ok = 0;
261 /* copy old to new */
262 if (early_res == early_res_x) {
263 memcpy(&new[1], &early_res[0],
264 sizeof(struct early_res) * max_early_res);
265 memset(&new[max_early_res+1], 0,
266 sizeof(struct early_res) * (max_early_res - 1));
267 early_res_count++;
268 } else {
269 memcpy(&new[1], &early_res[1],
270 sizeof(struct early_res) * (max_early_res - 1));
271 memset(&new[max_early_res], 0,
272 sizeof(struct early_res) * max_early_res);
273 }
274 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
275 early_res = new;
276 max_early_res *= 2;
277 printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
278 max_early_res, mem, mem + size - 1);
279}
280
281/*
282 * Most early reservations come here.
283 *
284 * We first have drop_overlaps_that_are_ok() drop any pre-existing
285 * 'overlap_ok' ranges, so that we can then reserve this memory
286 * range without risk of panic'ing on an overlapping overlap_ok
287 * early reservation.
288 */
289void __init reserve_early(u64 start, u64 end, char *name)
290{
291 if (start >= end)
292 return;
293
294 __check_and_double_early_res(start, end);
295
296 drop_overlaps_that_are_ok(start, end);
297 __reserve_early(start, end, name, 0);
298}
299
300void __init reserve_early_without_check(u64 start, u64 end, char *name)
301{
302 struct early_res *r;
303
304 if (start >= end)
305 return;
306
307 __check_and_double_early_res(start, end);
308
309 r = &early_res[early_res_count];
310
311 r->start = start;
312 r->end = end;
313 r->overlap_ok = 0;
314 if (name)
315 strncpy(r->name, name, sizeof(r->name) - 1);
316 early_res_count++;
317}
318
319void __init free_early(u64 start, u64 end)
320{
321 struct early_res *r;
322 int i;
323
324 kmemleak_free_part(__va(start), end - start);
325
326 i = find_overlapped_early(start, end);
327 r = &early_res[i];
328 if (i >= max_early_res || r->end != end || r->start != start)
329 panic("free_early on not reserved area: %llx-%llx!",
330 start, end - 1);
331
332 drop_range(i);
333}
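
Boot-time callers reserve a firmware or bootstrap area with reserve_early() and hand it back with free_early() once the data has been consumed; both take [start, end) physical addresses, and free_early() must name exactly the range that was reserved. A sketch with illustrative addresses:

	/* keep a hypothetical firmware table out of the early allocators */
	reserve_early(0x9f000, 0xa0000, "FW TABLE");

	/* ... later, after the table has been copied somewhere safe ... */
	free_early(0x9f000, 0xa0000);
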
334
335void __init free_early_partial(u64 start, u64 end)
336{
337 struct early_res *r;
338 int i;
339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
348try_next:
349 i = find_overlapped_early(start, end);
350 if (i >= max_early_res)
351 return;
352
353 r = &early_res[i];
354 /* hole ? */
355 if (r->end >= end && r->start <= start) {
356 drop_range_partial(i, start, end);
357 return;
358 }
359
360 drop_range_partial(i, start, end);
361 goto try_next;
362}
363
364#ifdef CONFIG_NO_BOOTMEM
365static void __init subtract_early_res(struct range *range, int az)
366{
367 int i, count;
368 u64 final_start, final_end;
369 int idx = 0;
370
371 count = 0;
372 for (i = 0; i < max_early_res && early_res[i].end; i++)
373 count++;
374
375	/* need to skip the first one? */
376 if (early_res != early_res_x)
377 idx = 1;
378
379#define DEBUG_PRINT_EARLY_RES 1
380
381#if DEBUG_PRINT_EARLY_RES
382 printk(KERN_INFO "Subtract (%d early reservations)\n", count);
383#endif
384 for (i = idx; i < count; i++) {
385 struct early_res *r = &early_res[i];
386#if DEBUG_PRINT_EARLY_RES
387 printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
388 r->start, r->end, r->name);
389#endif
390 final_start = PFN_DOWN(r->start);
391 final_end = PFN_UP(r->end);
392 if (final_start >= final_end)
393 continue;
394 subtract_range(range, az, final_start, final_end);
395 }
396
397}
398
399int __init get_free_all_memory_range(struct range **rangep, int nodeid)
400{
401 int i, count;
402 u64 start = 0, end;
403 u64 size;
404 u64 mem;
405 struct range *range;
406 int nr_range;
407
408 count = 0;
409 for (i = 0; i < max_early_res && early_res[i].end; i++)
410 count++;
411
412 count *= 2;
413
414 size = sizeof(struct range) * count;
415 end = get_max_mapped();
416#ifdef MAX_DMA32_PFN
417 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
418 start = MAX_DMA32_PFN << PAGE_SHIFT;
419#endif
420 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
421 if (mem == -1ULL)
422 panic("can not find more space for range free");
423
424 range = __va(mem);
425 /* use early_node_map[] and early_res to get range array at first */
426 memset(range, 0, size);
427 nr_range = 0;
428
429 /* need to go over early_node_map to find out good range for node */
430 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
431#ifdef CONFIG_X86_32
432 subtract_range(range, count, max_low_pfn, -1ULL);
433#endif
434 subtract_early_res(range, count);
435 nr_range = clean_sort_range(range, count);
436
437 /* need to clear it ? */
438 if (nodeid == MAX_NUMNODES) {
439 memset(&early_res[0], 0,
440 sizeof(struct early_res) * max_early_res);
441 early_res = NULL;
442 max_early_res = 0;
443 }
444
445 *rangep = range;
446 return nr_range;
447}
448#else
449void __init early_res_to_bootmem(u64 start, u64 end)
450{
451 int i, count;
452 u64 final_start, final_end;
453 int idx = 0;
454
455 count = 0;
456 for (i = 0; i < max_early_res && early_res[i].end; i++)
457 count++;
458
459	/* need to skip the first one? */
460 if (early_res != early_res_x)
461 idx = 1;
462
463 printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
464 count - idx, max_early_res, start, end);
465 for (i = idx; i < count; i++) {
466 struct early_res *r = &early_res[i];
467 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
468 r->start, r->end, r->name);
469 final_start = max(start, r->start);
470 final_end = min(end, r->end);
471 if (final_start >= final_end) {
472 printk(KERN_CONT "\n");
473 continue;
474 }
475 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
476 final_start, final_end);
477 reserve_bootmem_generic(final_start, final_end - final_start,
478 BOOTMEM_DEFAULT);
479 }
480 /* clear them */
481 memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
482 early_res = NULL;
483 max_early_res = 0;
484 early_res_count = 0;
485}
486#endif
487
488/* Check for already reserved areas */
489static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
490{
491 int i;
492 u64 addr = *addrp;
493 int changed = 0;
494 struct early_res *r;
495again:
496 i = find_overlapped_early(addr, addr + size);
497 r = &early_res[i];
498 if (i < max_early_res && r->end) {
499 *addrp = addr = round_up(r->end, align);
500 changed = 1;
501 goto again;
502 }
503 return changed;
504}
505
506/* Check for already reserved areas */
507static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
508{
509 int i;
510 u64 addr = *addrp, last;
511 u64 size = *sizep;
512 int changed = 0;
513again:
514 last = addr + size;
515 for (i = 0; i < max_early_res && early_res[i].end; i++) {
516 struct early_res *r = &early_res[i];
517 if (last > r->start && addr < r->start) {
518 size = r->start - addr;
519 changed = 1;
520 goto again;
521 }
522 if (last > r->end && addr < r->end) {
523 addr = round_up(r->end, align);
524 size = last - addr;
525 changed = 1;
526 goto again;
527 }
528 if (last <= r->end && addr >= r->start) {
529 (*sizep)++;
530 return 0;
531 }
532 }
533 if (changed) {
534 *addrp = addr;
535 *sizep = size;
536 }
537 return changed;
538}
539
540/*
541 * Find a free area with specified alignment in a specific range.
542 * Only the area between start and end is an active range from early_node_map,
543 * so it is known to be usable RAM.
544 */
545u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
546 u64 size, u64 align)
547{
548 u64 addr, last;
549
550 addr = round_up(ei_start, align);
551 if (addr < start)
552 addr = round_up(start, align);
553 if (addr >= ei_last)
554 goto out;
555 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
556 ;
557 last = addr + size;
558 if (last > ei_last)
559 goto out;
560 if (last > end)
561 goto out;
562
563 return addr;
564
565out:
566 return -1ULL;
567}
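
find_early_area() only locates a gap; the caller is expected to reserve it immediately so nothing else claims it. A sketch of the usual pairing, with illustrative variable names and a PAGE_SIZE alignment:

	u64 addr = find_early_area(ei_start, ei_last, start, end,
				   size, PAGE_SIZE);
	if (addr == -1ULL)
		return;		/* no suitable gap in this range */
	reserve_early(addr, addr + size, "EARLY DATA");
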
568
569u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start,
570 u64 *sizep, u64 align)
571{
572 u64 addr, last;
573
574 addr = round_up(ei_start, align);
575 if (addr < start)
576 addr = round_up(start, align);
577 if (addr >= ei_last)
578 goto out;
579 *sizep = ei_last - addr;
580 while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last)
581 ;
582 last = addr + *sizep;
583 if (last > ei_last)
584 goto out;
585
586 return addr;
587
588out:
589 return -1ULL;
590}
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
new file mode 100644
index 000000000000..ff915efef66d
--- /dev/null
+++ b/kernel/elfcore.c
@@ -0,0 +1,28 @@
1#include <linux/elf.h>
2#include <linux/fs.h>
3#include <linux/mm.h>
4
5#include <asm/elf.h>
6
7
8Elf_Half __weak elf_core_extra_phdrs(void)
9{
10 return 0;
11}
12
13int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size,
14 unsigned long limit)
15{
16 return 1;
17}
18
19int __weak elf_core_write_extra_data(struct file *file, size_t *size,
20 unsigned long limit)
21{
22 return 1;
23}
24
25size_t __weak elf_core_extra_data_size(void)
26{
27 return 0;
28}
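
An architecture opts in by supplying strong definitions with the same signatures, which override these __weak stubs at link time. A hedged sketch of such an override; the single extra program header is illustrative:

	Elf_Half elf_core_extra_phdrs(void)
	{
		return 1;	/* e.g. one arch-specific segment written by the other hooks */
	}
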
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..dd62f8e714ca 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
27static DEFINE_RWLOCK(exec_domains_lock); 27static DEFINE_RWLOCK(exec_domains_lock);
28 28
29 29
30static u_long ident_map[32] = { 30static unsigned long ident_map[32] = {
31 0, 1, 2, 3, 4, 5, 6, 7, 31 0, 1, 2, 3, 4, 5, 6, 7,
32 8, 9, 10, 11, 12, 13, 14, 15, 32 8, 9, 10, 11, 12, 13, 14, 15,
33 16, 17, 18, 19, 20, 21, 22, 23, 33 16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
56} 56}
57 57
58static struct exec_domain * 58static struct exec_domain *
59lookup_exec_domain(u_long personality) 59lookup_exec_domain(unsigned int personality)
60{ 60{
61 struct exec_domain * ep; 61 unsigned int pers = personality(personality);
62 u_long pers = personality(personality); 62 struct exec_domain *ep;
63 63
64 read_lock(&exec_domains_lock); 64 read_lock(&exec_domains_lock);
65 for (ep = exec_domains; ep; ep = ep->next) { 65 for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
70 70
71#ifdef CONFIG_MODULES 71#ifdef CONFIG_MODULES
72 read_unlock(&exec_domains_lock); 72 read_unlock(&exec_domains_lock);
73 request_module("personality-%ld", pers); 73 request_module("personality-%d", pers);
74 read_lock(&exec_domains_lock); 74 read_lock(&exec_domains_lock);
75 75
76 for (ep = exec_domains; ep; ep = ep->next) { 76 for (ep = exec_domains; ep; ep = ep->next) {
@@ -135,7 +135,7 @@ unregister:
135} 135}
136 136
137int 137int
138__set_personality(u_long personality) 138__set_personality(unsigned int personality)
139{ 139{
140 struct exec_domain *ep, *oep; 140 struct exec_domain *ep, *oep;
141 141
@@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void)
188module_init(proc_execdomains_init); 188module_init(proc_execdomains_init);
189#endif 189#endif
190 190
191SYSCALL_DEFINE1(personality, u_long, personality) 191SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 192{
193 u_long old = current->personality; 193 unsigned int old = current->personality;
194 194
195 if (personality != 0xffffffff) { 195 if (personality != 0xffffffff) {
196 set_personality(personality); 196 set_personality(personality);
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality)
198 return -EINVAL; 198 return -EINVAL;
199 } 199 }
200 200
201 return (long)old; 201 return old;
202} 202}
203 203
204 204
diff --git a/kernel/exit.c b/kernel/exit.c
index 546774a31a66..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,15 +55,14 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
62static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p, bool group_dead)
63{ 62{
64 nr_threads--; 63 nr_threads--;
65 detach_pid(p, PIDTYPE_PID); 64 detach_pid(p, PIDTYPE_PID);
66 if (thread_group_leader(p)) { 65 if (group_dead) {
67 detach_pid(p, PIDTYPE_PGID); 66 detach_pid(p, PIDTYPE_PGID);
68 detach_pid(p, PIDTYPE_SID); 67 detach_pid(p, PIDTYPE_SID);
69 68
@@ -80,23 +79,26 @@ static void __unhash_process(struct task_struct *p)
80static void __exit_signal(struct task_struct *tsk) 79static void __exit_signal(struct task_struct *tsk)
81{ 80{
82 struct signal_struct *sig = tsk->signal; 81 struct signal_struct *sig = tsk->signal;
82 bool group_dead = thread_group_leader(tsk);
83 struct sighand_struct *sighand; 83 struct sighand_struct *sighand;
84 struct tty_struct *uninitialized_var(tty);
84 85
85 BUG_ON(!sig); 86 sighand = rcu_dereference_check(tsk->sighand,
86 BUG_ON(!atomic_read(&sig->count)); 87 rcu_read_lock_held() ||
87 88 lockdep_tasklist_lock_is_held());
88 sighand = rcu_dereference(tsk->sighand);
89 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
90 90
91 posix_cpu_timers_exit(tsk); 91 posix_cpu_timers_exit(tsk);
92 if (atomic_dec_and_test(&sig->count)) 92 if (group_dead) {
93 posix_cpu_timers_exit_group(tsk); 93 posix_cpu_timers_exit_group(tsk);
94 else { 94 tty = sig->tty;
95 sig->tty = NULL;
96 } else {
95 /* 97 /*
96 * If there is any task waiting for the group exit 98 * If there is any task waiting for the group exit
97 * then notify it: 99 * then notify it:
98 */ 100 */
99 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) 101 if (sig->notify_count > 0 && !--sig->notify_count)
100 wake_up_process(sig->group_exit_task); 102 wake_up_process(sig->group_exit_task);
101 103
102 if (tsk == sig->curr_target) 104 if (tsk == sig->curr_target)
@@ -122,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->oublock += task_io_get_oublock(tsk); 124 sig->oublock += task_io_get_oublock(tsk);
123 task_io_accounting_add(&sig->ioac, &tsk->ioac); 125 task_io_accounting_add(&sig->ioac, &tsk->ioac);
124 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 126 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
125 sig = NULL; /* Marker for below. */
126 } 127 }
127 128
128 __unhash_process(tsk); 129 sig->nr_threads--;
130 __unhash_process(tsk, group_dead);
129 131
130 /* 132 /*
131 * Do this under ->siglock, we can race with another thread 133 * Do this under ->siglock, we can race with another thread
132 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 134 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
133 */ 135 */
134 flush_sigqueue(&tsk->pending); 136 flush_sigqueue(&tsk->pending);
135
136 tsk->signal = NULL;
137 tsk->sighand = NULL; 137 tsk->sighand = NULL;
138 spin_unlock(&sighand->siglock); 138 spin_unlock(&sighand->siglock);
139 139
140 __cleanup_sighand(sighand); 140 __cleanup_sighand(sighand);
141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
142 if (sig) { 142 if (group_dead) {
143 flush_sigqueue(&sig->shared_pending); 143 flush_sigqueue(&sig->shared_pending);
144 taskstats_tgid_free(sig); 144 tty_kref_put(tty);
145 /*
146 * Make sure ->signal can't go away under rq->lock,
147 * see account_group_exec_runtime().
148 */
149 task_rq_unlock_wait(tsk);
150 __cleanup_signal(sig);
151 } 145 }
152} 146}
153 147
@@ -170,8 +164,10 @@ void release_task(struct task_struct * p)
170repeat: 164repeat:
171 tracehook_prepare_release_task(p); 165 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 166 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 167 * can't be modifying its own credentials. But shut RCU-lockdep up */
168 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 169 atomic_dec(&__task_cred(p)->user->processes);
170 rcu_read_unlock();
175 171
176 proc_flush_task(p); 172 proc_flush_task(p);
177 173
@@ -473,9 +469,11 @@ static void close_files(struct files_struct * files)
473 /* 469 /*
474 * It is safe to dereference the fd table without RCU or 470 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 471 * ->file_lock because this is the last reference to the
476 * files structure. 472 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 473 */
474 rcu_read_lock();
478 fdt = files_fdtable(files); 475 fdt = files_fdtable(files);
476 rcu_read_unlock();
479 for (;;) { 477 for (;;) {
480 unsigned long set; 478 unsigned long set;
481 i = j * __NFDBITS; 479 i = j * __NFDBITS;
@@ -521,10 +519,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 519 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 520 * you can free files immediately.
523 */ 521 */
522 rcu_read_lock();
524 fdt = files_fdtable(files); 523 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 524 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 525 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 526 free_fdtable(fdt);
527 rcu_read_unlock();
528 } 528 }
529} 529}
530 530
@@ -849,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
849 849
850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
851 851
852 /* mt-exec, de_thread() is waiting for us */ 852 /* mt-exec, de_thread() is waiting for group leader */
853 if (thread_group_leader(tsk) && 853 if (unlikely(tsk->signal->notify_count < 0))
854 tsk->signal->group_exit_task &&
855 tsk->signal->notify_count < 0)
856 wake_up_process(tsk->signal->group_exit_task); 854 wake_up_process(tsk->signal->group_exit_task);
857
858 write_unlock_irq(&tasklist_lock); 855 write_unlock_irq(&tasklist_lock);
859 856
860 tracehook_report_death(tsk, signal, cookie, group_dead); 857 tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -944,7 +941,9 @@ NORET_TYPE void do_exit(long code)
944 preempt_count()); 941 preempt_count());
945 942
946 acct_update_integrals(tsk); 943 acct_update_integrals(tsk);
947 944 /* sync mm's RSS info before statistics gathering */
945 if (tsk->mm)
946 sync_mm_rss(tsk, tsk->mm);
948 group_dead = atomic_dec_and_test(&tsk->signal->live); 947 group_dead = atomic_dec_and_test(&tsk->signal->live);
949 if (group_dead) { 948 if (group_dead) {
950 hrtimer_cancel(&tsk->signal->real_timer); 949 hrtimer_cancel(&tsk->signal->real_timer);
@@ -993,8 +992,10 @@ NORET_TYPE void do_exit(long code)
993 992
994 exit_notify(tsk, group_dead); 993 exit_notify(tsk, group_dead);
995#ifdef CONFIG_NUMA 994#ifdef CONFIG_NUMA
995 task_lock(tsk);
996 mpol_put(tsk->mempolicy); 996 mpol_put(tsk->mempolicy);
997 tsk->mempolicy = NULL; 997 tsk->mempolicy = NULL;
998 task_unlock(tsk);
998#endif 999#endif
999#ifdef CONFIG_FUTEX 1000#ifdef CONFIG_FUTEX
1000 if (unlikely(current->pi_state_cache)) 1001 if (unlikely(current->pi_state_cache))
@@ -1180,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1180 1181
1181 if (unlikely(wo->wo_flags & WNOWAIT)) { 1182 if (unlikely(wo->wo_flags & WNOWAIT)) {
1182 int exit_code = p->exit_code; 1183 int exit_code = p->exit_code;
1183 int why, status; 1184 int why;
1184 1185
1185 get_task_struct(p); 1186 get_task_struct(p);
1186 read_unlock(&tasklist_lock); 1187 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -87,6 +87,14 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89 89
90#ifdef CONFIG_PROVE_RCU
91int lockdep_tasklist_lock_is_held(void)
92{
93 return lockdep_is_held(&tasklist_lock);
94}
95EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
96#endif /* #ifdef CONFIG_PROVE_RCU */
97
90int nr_processes(void) 98int nr_processes(void)
91{ 99{
92 int cpu; 100 int cpu;
@@ -157,6 +165,18 @@ void free_task(struct task_struct *tsk)
157} 165}
158EXPORT_SYMBOL(free_task); 166EXPORT_SYMBOL(free_task);
159 167
168static inline void free_signal_struct(struct signal_struct *sig)
169{
170 taskstats_tgid_free(sig);
171 kmem_cache_free(signal_cachep, sig);
172}
173
174static inline void put_signal_struct(struct signal_struct *sig)
175{
176 if (atomic_dec_and_test(&sig->sigcnt))
177 free_signal_struct(sig);
178}
179
160void __put_task_struct(struct task_struct *tsk) 180void __put_task_struct(struct task_struct *tsk)
161{ 181{
162 WARN_ON(!tsk->exit_state); 182 WARN_ON(!tsk->exit_state);
@@ -165,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
165 185
166 exit_creds(tsk); 186 exit_creds(tsk);
167 delayacct_tsk_free(tsk); 187 delayacct_tsk_free(tsk);
188 put_signal_struct(tsk->signal);
168 189
169 if (!profile_handoff_task(tsk)) 190 if (!profile_handoff_task(tsk))
170 free_task(tsk); 191 free_task(tsk);
@@ -328,15 +349,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
328 if (!tmp) 349 if (!tmp)
329 goto fail_nomem; 350 goto fail_nomem;
330 *tmp = *mpnt; 351 *tmp = *mpnt;
352 INIT_LIST_HEAD(&tmp->anon_vma_chain);
331 pol = mpol_dup(vma_policy(mpnt)); 353 pol = mpol_dup(vma_policy(mpnt));
332 retval = PTR_ERR(pol); 354 retval = PTR_ERR(pol);
333 if (IS_ERR(pol)) 355 if (IS_ERR(pol))
334 goto fail_nomem_policy; 356 goto fail_nomem_policy;
335 vma_set_policy(tmp, pol); 357 vma_set_policy(tmp, pol);
358 if (anon_vma_fork(tmp, mpnt))
359 goto fail_nomem_anon_vma_fork;
336 tmp->vm_flags &= ~VM_LOCKED; 360 tmp->vm_flags &= ~VM_LOCKED;
337 tmp->vm_mm = mm; 361 tmp->vm_mm = mm;
338 tmp->vm_next = NULL; 362 tmp->vm_next = NULL;
339 anon_vma_link(tmp);
340 file = tmp->vm_file; 363 file = tmp->vm_file;
341 if (file) { 364 if (file) {
342 struct inode *inode = file->f_path.dentry->d_inode; 365 struct inode *inode = file->f_path.dentry->d_inode;
@@ -391,6 +414,8 @@ out:
391 flush_tlb_mm(oldmm); 414 flush_tlb_mm(oldmm);
392 up_write(&oldmm->mmap_sem); 415 up_write(&oldmm->mmap_sem);
393 return retval; 416 return retval;
417fail_nomem_anon_vma_fork:
418 mpol_put(pol);
394fail_nomem_policy: 419fail_nomem_policy:
395 kmem_cache_free(vm_area_cachep, tmp); 420 kmem_cache_free(vm_area_cachep, tmp);
396fail_nomem: 421fail_nomem:
@@ -454,8 +479,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
454 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; 479 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
455 mm->core_state = NULL; 480 mm->core_state = NULL;
456 mm->nr_ptes = 0; 481 mm->nr_ptes = 0;
457 set_mm_counter(mm, file_rss, 0); 482 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
458 set_mm_counter(mm, anon_rss, 0);
459 spin_lock_init(&mm->page_table_lock); 483 spin_lock_init(&mm->page_table_lock);
460 mm->free_area_cache = TASK_UNMAPPED_BASE; 484 mm->free_area_cache = TASK_UNMAPPED_BASE;
461 mm->cached_hole_size = ~0UL; 485 mm->cached_hole_size = ~0UL;
@@ -824,23 +848,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
824 */ 848 */
825static void posix_cpu_timers_init_group(struct signal_struct *sig) 849static void posix_cpu_timers_init_group(struct signal_struct *sig)
826{ 850{
851 unsigned long cpu_limit;
852
827 /* Thread group counters. */ 853 /* Thread group counters. */
828 thread_group_cputime_init(sig); 854 thread_group_cputime_init(sig);
829 855
830 /* Expiration times and increments. */ 856 cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
831 sig->it[CPUCLOCK_PROF].expires = cputime_zero; 857 if (cpu_limit != RLIM_INFINITY) {
832 sig->it[CPUCLOCK_PROF].incr = cputime_zero; 858 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
833 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
834 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
835
836 /* Cached expiration times. */
837 sig->cputime_expires.prof_exp = cputime_zero;
838 sig->cputime_expires.virt_exp = cputime_zero;
839 sig->cputime_expires.sched_exp = 0;
840
841 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
842 sig->cputime_expires.prof_exp =
843 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
844 sig->cputimer.running = 1; 859 sig->cputimer.running = 1;
845 } 860 }
846 861
@@ -857,54 +872,30 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
857 if (clone_flags & CLONE_THREAD) 872 if (clone_flags & CLONE_THREAD)
858 return 0; 873 return 0;
859 874
860 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 875 sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
861 tsk->signal = sig; 876 tsk->signal = sig;
862 if (!sig) 877 if (!sig)
863 return -ENOMEM; 878 return -ENOMEM;
864 879
865 atomic_set(&sig->count, 1); 880 sig->nr_threads = 1;
866 atomic_set(&sig->live, 1); 881 atomic_set(&sig->live, 1);
882 atomic_set(&sig->sigcnt, 1);
867 init_waitqueue_head(&sig->wait_chldexit); 883 init_waitqueue_head(&sig->wait_chldexit);
868 sig->flags = 0;
869 if (clone_flags & CLONE_NEWPID) 884 if (clone_flags & CLONE_NEWPID)
870 sig->flags |= SIGNAL_UNKILLABLE; 885 sig->flags |= SIGNAL_UNKILLABLE;
871 sig->group_exit_code = 0;
872 sig->group_exit_task = NULL;
873 sig->group_stop_count = 0;
874 sig->curr_target = tsk; 886 sig->curr_target = tsk;
875 init_sigpending(&sig->shared_pending); 887 init_sigpending(&sig->shared_pending);
876 INIT_LIST_HEAD(&sig->posix_timers); 888 INIT_LIST_HEAD(&sig->posix_timers);
877 889
878 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 890 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
879 sig->it_real_incr.tv64 = 0;
880 sig->real_timer.function = it_real_fn; 891 sig->real_timer.function = it_real_fn;
881 892
882 sig->leader = 0; /* session leadership doesn't inherit */
883 sig->tty_old_pgrp = NULL;
884 sig->tty = NULL;
885
886 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
887 sig->gtime = cputime_zero;
888 sig->cgtime = cputime_zero;
889#ifndef CONFIG_VIRT_CPU_ACCOUNTING
890 sig->prev_utime = sig->prev_stime = cputime_zero;
891#endif
892 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
893 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
894 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
895 sig->maxrss = sig->cmaxrss = 0;
896 task_io_accounting_init(&sig->ioac);
897 sig->sum_sched_runtime = 0;
898 taskstats_tgid_init(sig);
899
900 task_lock(current->group_leader); 893 task_lock(current->group_leader);
901 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 894 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
902 task_unlock(current->group_leader); 895 task_unlock(current->group_leader);
903 896
904 posix_cpu_timers_init_group(sig); 897 posix_cpu_timers_init_group(sig);
905 898
906 acct_init_pacct(&sig->pacct);
907
908 tty_audit_fork(sig); 899 tty_audit_fork(sig);
909 900
910 sig->oom_adj = current->signal->oom_adj; 901 sig->oom_adj = current->signal->oom_adj;
@@ -912,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
912 return 0; 903 return 0;
913} 904}
914 905
915void __cleanup_signal(struct signal_struct *sig)
916{
917 thread_group_cputime_free(sig);
918 tty_kref_put(sig->tty);
919 kmem_cache_free(signal_cachep, sig);
920}
921
922static void copy_flags(unsigned long clone_flags, struct task_struct *p) 906static void copy_flags(unsigned long clone_flags, struct task_struct *p)
923{ 907{
924 unsigned long new_flags = p->flags; 908 unsigned long new_flags = p->flags;
@@ -1033,7 +1017,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1033#endif 1017#endif
1034 retval = -EAGAIN; 1018 retval = -EAGAIN;
1035 if (atomic_read(&p->real_cred->user->processes) >= 1019 if (atomic_read(&p->real_cred->user->processes) >=
1036 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 1020 task_rlimit(p, RLIMIT_NPROC)) {
1037 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 1021 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
1038 p->real_cred->user != INIT_USER) 1022 p->real_cred->user != INIT_USER)
1039 goto bad_fork_free; 1023 goto bad_fork_free;
@@ -1075,6 +1059,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 p->prev_utime = cputime_zero; 1059 p->prev_utime = cputime_zero;
1076 p->prev_stime = cputime_zero; 1060 p->prev_stime = cputime_zero;
1077#endif 1061#endif
1062#if defined(SPLIT_RSS_COUNTING)
1063 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1064#endif
1078 1065
1079 p->default_timer_slack_ns = current->timer_slack_ns; 1066 p->default_timer_slack_ns = current->timer_slack_ns;
1080 1067
@@ -1132,10 +1119,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 p->memcg_batch.memcg = NULL; 1119 p->memcg_batch.memcg = NULL;
1133#endif 1120#endif
1134 1121
1135 p->bts = NULL;
1136
1137 p->stack_start = stack_start;
1138
1139 /* Perform scheduler related setup. Assign this task to a CPU. */ 1122 /* Perform scheduler related setup. Assign this task to a CPU. */
1140 sched_fork(p, clone_flags); 1123 sched_fork(p, clone_flags);
1141 1124
@@ -1241,21 +1224,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1241 /* Need tasklist lock for parent etc handling! */ 1224 /* Need tasklist lock for parent etc handling! */
1242 write_lock_irq(&tasklist_lock); 1225 write_lock_irq(&tasklist_lock);
1243 1226
1244 /*
1245 * The task hasn't been attached yet, so its cpus_allowed mask will
1246 * not be changed, nor will its assigned CPU.
1247 *
1248 * The cpus_allowed mask of the parent may have changed after it was
1249 * copied first time - so re-copy it here, then check the child's CPU
1250 * to ensure it is on a valid CPU (and if not, just force it back to
1251 * parent's CPU). This avoids alot of nasty races.
1252 */
1253 p->cpus_allowed = current->cpus_allowed;
1254 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1255 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1256 !cpu_online(task_cpu(p))))
1257 set_task_cpu(p, smp_processor_id());
1258
1259 /* CLONE_PARENT re-uses the old parent */ 1227 /* CLONE_PARENT re-uses the old parent */
1260 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1228 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1261 p->real_parent = current->real_parent; 1229 p->real_parent = current->real_parent;
@@ -1284,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1284 } 1252 }
1285 1253
1286 if (clone_flags & CLONE_THREAD) { 1254 if (clone_flags & CLONE_THREAD) {
1287 atomic_inc(&current->signal->count); 1255 current->signal->nr_threads++;
1288 atomic_inc(&current->signal->live); 1256 atomic_inc(&current->signal->live);
1257 atomic_inc(&current->signal->sigcnt);
1289 p->group_leader = current->group_leader; 1258 p->group_leader = current->group_leader;
1290 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1259 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1291 } 1260 }
@@ -1298,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1298 p->nsproxy->pid_ns->child_reaper = p; 1267 p->nsproxy->pid_ns->child_reaper = p;
1299 1268
1300 p->signal->leader_pid = pid; 1269 p->signal->leader_pid = pid;
1301 tty_kref_put(p->signal->tty);
1302 p->signal->tty = tty_kref_get(current->signal->tty); 1270 p->signal->tty = tty_kref_get(current->signal->tty);
1303 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1271 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1304 attach_pid(p, PIDTYPE_SID, task_session(current)); 1272 attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1331,7 +1299,7 @@ bad_fork_cleanup_mm:
1331 mmput(p->mm); 1299 mmput(p->mm);
1332bad_fork_cleanup_signal: 1300bad_fork_cleanup_signal:
1333 if (!(clone_flags & CLONE_THREAD)) 1301 if (!(clone_flags & CLONE_THREAD))
1334 __cleanup_signal(p->signal); 1302 free_signal_struct(p->signal);
1335bad_fork_cleanup_sighand: 1303bad_fork_cleanup_sighand:
1336 __cleanup_sighand(p->sighand); 1304 __cleanup_sighand(p->sighand);
1337bad_fork_cleanup_fs: 1305bad_fork_cleanup_fs:
@@ -1366,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
1366 return regs; 1334 return regs;
1367} 1335}
1368 1336
1337static inline void init_idle_pids(struct pid_link *links)
1338{
1339 enum pid_type type;
1340
1341 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
1342 INIT_HLIST_NODE(&links[type].node); /* not really needed */
1343 links[type].pid = &init_struct_pid;
1344 }
1345}
1346
1369struct task_struct * __cpuinit fork_idle(int cpu) 1347struct task_struct * __cpuinit fork_idle(int cpu)
1370{ 1348{
1371 struct task_struct *task; 1349 struct task_struct *task;
@@ -1373,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1373 1351
1374 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1352 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1375 &init_struct_pid, 0); 1353 &init_struct_pid, 0);
1376 if (!IS_ERR(task)) 1354 if (!IS_ERR(task)) {
1355 init_idle_pids(task->pids);
1377 init_idle(task, cpu); 1356 init_idle(task, cpu);
1357 }
1378 1358
1379 return task; 1359 return task;
1380} 1360}
@@ -1546,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
1546 *flags_ptr |= CLONE_SIGHAND; 1526 *flags_ptr |= CLONE_SIGHAND;
1547 1527
1548 /* 1528 /*
1549 * If unsharing signal handlers and the task was created
1550 * using CLONE_THREAD, then must unshare the thread
1551 */
1552 if ((*flags_ptr & CLONE_SIGHAND) &&
1553 (atomic_read(&current->signal->count) > 1))
1554 *flags_ptr |= CLONE_THREAD;
1555
1556 /*
1557 * If unsharing namespace, must also unshare filesystem information. 1529 * If unsharing namespace, must also unshare filesystem information.
1558 */ 1530 */
1559 if (*flags_ptr & CLONE_NEWNS) 1531 if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9d..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
429static struct task_struct * futex_find_get_task(pid_t pid) 429static struct task_struct * futex_find_get_task(pid_t pid)
430{ 430{
431 struct task_struct *p; 431 struct task_struct *p;
432 const struct cred *cred = current_cred(), *pcred;
433 432
434 rcu_read_lock(); 433 rcu_read_lock();
435 p = find_task_by_vpid(pid); 434 p = find_task_by_vpid(pid);
436 if (!p) { 435 if (p)
437 p = ERR_PTR(-ESRCH); 436 get_task_struct(p);
438 } else {
439 pcred = __task_cred(p);
440 if (cred->euid != pcred->euid &&
441 cred->euid != pcred->uid)
442 p = ERR_PTR(-ESRCH);
443 else
444 get_task_struct(p);
445 }
446 437
447 rcu_read_unlock(); 438 rcu_read_unlock();
448 439
@@ -530,8 +521,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
530 return -EINVAL; 521 return -EINVAL;
531 522
532 WARN_ON(!atomic_read(&pi_state->refcount)); 523 WARN_ON(!atomic_read(&pi_state->refcount));
533 WARN_ON(pid && pi_state->owner && 524
534 pi_state->owner->pid != pid); 525 /*
526 * When pi_state->owner is NULL then the owner died
527 * and another waiter is on the fly. pi_state->owner
528 * is fixed up by the task which acquires
529 * pi_state->rt_mutex.
530 *
531 * We do not check for pid == 0 which can happen when
532 * the owner died and robust_list_exit() cleared the
533 * TID.
534 */
535 if (pid && pi_state->owner) {
536 /*
537 * Bail out if user space manipulated the
538 * futex value.
539 */
540 if (pid != task_pid_vnr(pi_state->owner))
541 return -EINVAL;
542 }
535 543
536 atomic_inc(&pi_state->refcount); 544 atomic_inc(&pi_state->refcount);
537 *ps = pi_state; 545 *ps = pi_state;
@@ -547,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
547 if (!pid) 555 if (!pid)
548 return -ESRCH; 556 return -ESRCH;
549 p = futex_find_get_task(pid); 557 p = futex_find_get_task(pid);
550 if (IS_ERR(p)) 558 if (!p)
551 return PTR_ERR(p); 559 return -ESRCH;
552 560
553 /* 561 /*
554 * We need to look at the task state flags to figure out, 562 * We need to look at the task state flags to figure out,
@@ -758,6 +766,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
758 if (!pi_state) 766 if (!pi_state)
759 return -EINVAL; 767 return -EINVAL;
760 768
769 /*
770 * If current does not own the pi_state then the futex is
771 * inconsistent and user space fiddled with the futex value.
772 */
773 if (pi_state->owner != current)
774 return -EINVAL;
775
761 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 776 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
762 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 777 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
763 778
@@ -1971,7 +1986,7 @@ retry_private:
1971 /* Unqueue and drop the lock */ 1986 /* Unqueue and drop the lock */
1972 unqueue_me_pi(&q); 1987 unqueue_me_pi(&q);
1973 1988
1974 goto out; 1989 goto out_put_key;
1975 1990
1976out_unlock_put_key: 1991out_unlock_put_key:
1977 queue_unlock(&q, hb); 1992 queue_unlock(&q, hb);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 235716556bf1..d49afb2395e5 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -146,7 +146,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
146 struct task_struct *p; 146 struct task_struct *p;
147 147
148 ret = -ESRCH; 148 ret = -ESRCH;
149 read_lock(&tasklist_lock); 149 rcu_read_lock();
150 p = find_task_by_vpid(pid); 150 p = find_task_by_vpid(pid);
151 if (!p) 151 if (!p)
152 goto err_unlock; 152 goto err_unlock;
@@ -157,7 +157,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
157 !capable(CAP_SYS_PTRACE)) 157 !capable(CAP_SYS_PTRACE))
158 goto err_unlock; 158 goto err_unlock;
159 head = p->compat_robust_list; 159 head = p->compat_robust_list;
160 read_unlock(&tasklist_lock); 160 rcu_read_unlock();
161 } 161 }
162 162
163 if (put_user(sizeof(*head), len_ptr)) 163 if (put_user(sizeof(*head), len_ptr))
@@ -165,7 +165,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
165 return put_user(ptr_to_compat(head), head_ptr); 165 return put_user(ptr_to_compat(head), head_ptr);
166 166
167err_unlock: 167err_unlock:
168 read_unlock(&tasklist_lock); 168 rcu_read_unlock();
169 169
170 return ret; 170 return ret;
171} 171}
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..5c69e996bd0f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
89 89
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = wall_to_monotonic;
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
1814 * The kernel give the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1822 1837
1823/** 1838/**
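
With the clock factored out into a parameter, schedule_hrtimeout_range() becomes a one-line CLOCK_MONOTONIC wrapper, and a CLOCK_REALTIME sleeper can be built the same way. A sketch, assuming the new helper's prototype is visible through linux/hrtimer.h; the wrapper name is hypothetical.

#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/time.h>

/* Sleep until *expires on CLOCK_REALTIME, allowing the usual slack in delta. */
static int __sched sleep_until_realtime(ktime_t *expires, unsigned long delta)
{
	set_current_state(TASK_INTERRUPTIBLE);
	return schedule_hrtimeout_range_clock(expires, delta,
					      HRTIMER_MODE_ABS, CLOCK_REALTIME);
}
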
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 50dbd5999588..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu 91 * have in this cpu
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
180 * on the same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -243,38 +282,112 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 282 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM 283 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
245 */ 284 */
246int reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
249 int ret = 0; 288 enum bp_type_idx type;
289 int weight;
250 290
251 mutex_lock(&nr_bp_mutex); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
252 294
253 fetch_bp_busy_slots(&slots, bp); 295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
254 305
255 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
256 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
257 ret = -ENOSPC; 308 return -ENOSPC;
258 goto end;
259 }
260 309
261 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
311
312 return 0;
313}
314
315int reserve_bp_slot(struct perf_event *bp)
316{
317 int ret;
318
319 mutex_lock(&nr_bp_mutex);
320
321 ret = __reserve_bp_slot(bp);
262 322
263end:
264 mutex_unlock(&nr_bp_mutex); 323 mutex_unlock(&nr_bp_mutex);
265 324
266 return ret; 325 return ret;
267} 326}
268 327
328static void __release_bp_slot(struct perf_event *bp)
329{
330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
336}
337
269void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
270{ 339{
271 mutex_lock(&nr_bp_mutex); 340 mutex_lock(&nr_bp_mutex);
272 341
273 toggle_bp_slot(bp, false); 342 __release_bp_slot(bp);
274 343
275 mutex_unlock(&nr_bp_mutex); 344 mutex_unlock(&nr_bp_mutex);
276} 345}
277 346
347/*
348 * Allow the kernel debugger to reserve breakpoint slots without
349 * taking a lock using the dbg_* variant of for the reserve and
350 * release breakpoint slots.
351 */
352int dbg_reserve_bp_slot(struct perf_event *bp)
353{
354 if (mutex_is_locked(&nr_bp_mutex))
355 return -1;
356
357 return __reserve_bp_slot(bp);
358}
359
360int dbg_release_bp_slot(struct perf_event *bp)
361{
362 if (mutex_is_locked(&nr_bp_mutex))
363 return -1;
364
365 __release_bp_slot(bp);
366
367 return 0;
368}
369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
278 391
279int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
280{ 393{
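
The dbg_* entry points above exist so that kgdb/kdb, which may stop the machine at an arbitrary point, can account for a breakpoint slot without sleeping: if nr_bp_mutex happens to be held they simply refuse. A sketch of a debugger-context caller, assuming the prototypes are exported through linux/hw_breakpoint.h; the function names and the -EBUSY retry policy are illustrative.

#include <linux/errno.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

/* bp is an already-initialized breakpoint event owned by the debugger. */
static int debugger_claim_slot(struct perf_event *bp)
{
	if (dbg_reserve_bp_slot(bp))
		return -EBUSY;	/* nr_bp_mutex is held elsewhere; caller may retry */

	/* ... arch code would now program the debug registers ... */
	return 0;
}

static void debugger_drop_slot(struct perf_event *bp)
{
	/* ... arch code clears the debug registers first ... */
	dbg_release_bp_slot(bp);
}
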
@@ -284,17 +397,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
284 if (ret) 397 if (ret)
285 return ret; 398 return ret;
286 399
287 /* 400 ret = validate_hw_breakpoint(bp);
288 * Ptrace breakpoints can be temporary perf events only 401
289 * meant to reserve a slot. In this case, it is created disabled and 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
290 * we don't want to check the params right now (as we put a null addr) 403 if (ret)
291 * But perf tools create events as disabled and we want to check 404 release_bp_slot(bp);
292 * the params for them.
293 * This is a quick hack that will be removed soon, once we remove
294 * the tmp breakpoints from ptrace
295 */
296 if (!bp->attr.disabled || !bp->overflow_handler)
297 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
298 405
299 return ret; 406 return ret;
300} 407}
@@ -324,8 +431,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
324int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) 431int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
325{ 432{
326 u64 old_addr = bp->attr.bp_addr; 433 u64 old_addr = bp->attr.bp_addr;
434 u64 old_len = bp->attr.bp_len;
327 int old_type = bp->attr.bp_type; 435 int old_type = bp->attr.bp_type;
328 int old_len = bp->attr.bp_len;
329 int err = 0; 436 int err = 0;
330 437
331 perf_event_disable(bp); 438 perf_event_disable(bp);
@@ -337,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
337 if (attr->disabled) 444 if (attr->disabled)
338 goto end; 445 goto end;
339 446
340 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
341 if (!err) 448 if (!err)
342 perf_event_enable(bp); 449 perf_event_enable(bp);
343 450
@@ -377,17 +484,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
377 * 484 *
378 * @return a set of per_cpu pointers to perf events 485 * @return a set of per_cpu pointers to perf events
379 */ 486 */
380struct perf_event ** 487struct perf_event * __percpu *
381register_wide_hw_breakpoint(struct perf_event_attr *attr, 488register_wide_hw_breakpoint(struct perf_event_attr *attr,
382 perf_overflow_handler_t triggered) 489 perf_overflow_handler_t triggered)
383{ 490{
384 struct perf_event **cpu_events, **pevent, *bp; 491 struct perf_event * __percpu *cpu_events, **pevent, *bp;
385 long err; 492 long err;
386 int cpu; 493 int cpu;
387 494
388 cpu_events = alloc_percpu(typeof(*cpu_events)); 495 cpu_events = alloc_percpu(typeof(*cpu_events));
389 if (!cpu_events) 496 if (!cpu_events)
390 return ERR_PTR(-ENOMEM); 497 return (void __percpu __force *)ERR_PTR(-ENOMEM);
391 498
392 get_online_cpus(); 499 get_online_cpus();
393 for_each_online_cpu(cpu) { 500 for_each_online_cpu(cpu) {
@@ -415,7 +522,7 @@ fail:
415 put_online_cpus(); 522 put_online_cpus();
416 523
417 free_percpu(cpu_events); 524 free_percpu(cpu_events);
418 return ERR_PTR(err); 525 return (void __percpu __force *)ERR_PTR(err);
419} 526}
420EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 527EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
421 528
@@ -423,7 +530,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
423 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 530 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
424 * @cpu_events: the per cpu set of events to unregister 531 * @cpu_events: the per cpu set of events to unregister
425 */ 532 */
426void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 533void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
427{ 534{
428 int cpu; 535 int cpu;
429 struct perf_event **pevent; 536 struct perf_event **pevent;
@@ -444,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
444 551
445static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
446{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
447 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
448} 584}
449core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
450 586
@@ -453,5 +589,4 @@ struct pmu perf_ops_bp = {
453 .enable = arch_install_hw_breakpoint, 589 .enable = arch_install_hw_breakpoint,
454 .disable = arch_uninstall_hw_breakpoint, 590 .disable = arch_uninstall_hw_breakpoint,
455 .read = hw_breakpoint_pmu_read, 591 .read = hw_breakpoint_pmu_read,
456 .unthrottle = hw_breakpoint_pmu_unthrottle
457}; 592};
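
Because register_wide_hw_breakpoint() now returns a __percpu pointer, callers have to force-cast before the IS_ERR()/PTR_ERR() checks, mirroring what the error paths above do. A hedged caller sketch; the watched address, length and overflow handler are placeholders.

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/err.h>

static struct perf_event * __percpu *wide_bp;

static int install_wide_write_bp(unsigned long addr,
				 perf_overflow_handler_t handler)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = addr;			/* address to watch (placeholder) */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	wide_bp = register_wide_hw_breakpoint(&attr, handler);
	if (IS_ERR((void __force *)wide_bp))
		return PTR_ERR((void __force *)wide_bp);

	return 0;
}

static void remove_wide_write_bp(void)
{
	unregister_wide_hw_breakpoint(wide_bp);
}
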
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa28f666..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,11 +18,7 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22 * dynamic_irq_init - initialize a dynamically allocated irq
23 * @irq: irq number to initialize
24 */
25void dynamic_irq_init(unsigned int irq)
26{ 22{
27 struct irq_desc *desc; 23 struct irq_desc *desc;
28 unsigned long flags; 24 unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
41 desc->depth = 1; 37 desc->depth = 1;
42 desc->msi_desc = NULL; 38 desc->msi_desc = NULL;
43 desc->handler_data = NULL; 39 desc->handler_data = NULL;
44 desc->chip_data = NULL; 40 if (!keep_chip_data)
41 desc->chip_data = NULL;
45 desc->action = NULL; 42 desc->action = NULL;
46 desc->irq_count = 0; 43 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 44 desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
55} 52}
56 53
57/** 54/**
58 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 55 * dynamic_irq_init - initialize a dynamically allocated irq
59 * @irq: irq number to initialize 56 * @irq: irq number to initialize
60 */ 57 */
61void dynamic_irq_cleanup(unsigned int irq) 58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
62{ 75{
63 struct irq_desc *desc = irq_to_desc(irq); 76 struct irq_desc *desc = irq_to_desc(irq);
64 unsigned long flags; 77 unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq)
77 } 90 }
78 desc->msi_desc = NULL; 91 desc->msi_desc = NULL;
79 desc->handler_data = NULL; 92 desc->handler_data = NULL;
80 desc->chip_data = NULL; 93 if (!keep_chip_data)
94 desc->chip_data = NULL;
81 desc->handle_irq = handle_bad_irq; 95 desc->handle_irq = handle_bad_irq;
82 desc->chip = &no_irq_chip; 96 desc->chip = &no_irq_chip;
83 desc->name = NULL; 97 desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq)
85 raw_spin_unlock_irqrestore(&desc->lock, flags); 99 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 100}
87 101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
88 122
89/** 123/**
90 * set_irq_chip - set the irq chip for an irq 124 * set_irq_chip - set the irq chip for an irq
@@ -325,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
325 if (desc->chip->ack) 359 if (desc->chip->ack)
326 desc->chip->ack(irq); 360 desc->chip->ack(irq);
327 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
328} 379}
329 380
330/* 381/*
@@ -450,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
450 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
452 503
453 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
454 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq);
457out_unlock: 506out_unlock:
458 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
459} 508}
@@ -490,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
490 action = desc->action; 539 action = desc->action;
491 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
492 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
493 if (desc->chip->mask) 542 mask_irq(desc, irq);
494 desc->chip->mask(irq);
495 goto out; 543 goto out;
496 } 544 }
497 545
@@ -520,7 +568,7 @@ out:
520 * signal. The occurence is latched into the irq controller hardware 568 * signal. The occurence is latched into the irq controller hardware
521 * and must be acked in order to be reenabled. After the ack another 569 * and must be acked in order to be reenabled. After the ack another
522 * interrupt can happen on the same source even before the first one 570 * interrupt can happen on the same source even before the first one
523 * is handled by the assosiacted event handler. If this happens it 571 * is handled by the associated event handler. If this happens it
524 * might be necessary to disable (mask) the interrupt depending on the 572 * might be necessary to disable (mask) the interrupt depending on the
525 * controller hardware. This requires to reenable the interrupt inside 573 * controller hardware. This requires to reenable the interrupt inside
526 * of the loop which handles the interrupts which have arrived while 574 * of the loop which handles the interrupts which have arrived while
@@ -559,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
559 irqreturn_t action_ret; 607 irqreturn_t action_ret;
560 608
561 if (unlikely(!action)) { 609 if (unlikely(!action)) {
562 desc->chip->mask(irq); 610 mask_irq(desc, irq);
563 goto out_unlock; 611 goto out_unlock;
564 } 612 }
565 613
@@ -571,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
571 if (unlikely((desc->status & 619 if (unlikely((desc->status &
572 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
573 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
574 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
575 desc->status &= ~IRQ_MASKED;
576 } 623 }
577 624
578 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -682,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
682 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
683} 730}
684 731
685void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
686{ 733{
687 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
688 unsigned long flags; 735 unsigned long flags;
@@ -697,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
697 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 745}
699 746
700void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
701{ 748{
702 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
703 unsigned long flags; 750 unsigned long flags;
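
The *_keep_chip_data variants above exist for code (x86 MSI/ioapic descriptor handling, for instance) that re-initializes an irq descriptor but must not lose the chip_data pointer the architecture parked there. A purely illustrative sketch of that call pattern; recycle_irq() is hypothetical.

#include <linux/irq.h>

/* Re-init a descriptor without losing irq_to_desc(irq)->chip_data. */
static void recycle_irq(unsigned int irq)
{
	dynamic_irq_cleanup_keep_chip_data(irq);	/* tear down, keep chip_data */

	/* ... reprogram or re-route the interrupt here ... */

	dynamic_irq_init_keep_chip_data(irq);		/* fresh state, chip_data intact */
}
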
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * automatically freed on driver detach. 42 * automatically freed on driver detach.
43 * 43 *
44 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
45 * separately, dev_free_irq() must be used. 45 * separately, devm_free_irq() must be used.
46 */ 46 */
47int devm_request_threaded_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
48 irq_handler_t handler, irq_handler_t thread_fn, 48 irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
81 * Except for the extra @dev argument, this function takes the 81 * Except for the extra @dev argument, this function takes the
82 * same arguments and performs the same function as free_irq(). 82 * same arguments and performs the same function as free_irq().
83 * This function instead of free_irq() should be used to manually 83 * This function instead of free_irq() should be used to manually
84 * free IRQs allocated with dev_request_irq(). 84 * free IRQs allocated with devm_request_irq().
85 */ 85 */
86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) 86void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
87{ 87{
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 814940e7f485..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -19,7 +19,7 @@
19#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 20#include <linux/rculist.h>
21#include <linux/hash.h> 21#include <linux/hash.h>
22#include <linux/bootmem.h> 22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 23#include <trace/events/irq.h>
24 24
25#include "internals.h" 25#include "internals.h"
@@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{ 87{
88 void *ptr; 88 void *ptr;
89 89
90 if (slab_is_available()) 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), 91 GFP_ATOMIC, node);
92 GFP_ATOMIC, node);
93 else
94 ptr = alloc_bootmem_node(NODE_DATA(node),
95 nr * sizeof(*desc->kstat_irqs));
96 92
97 /* 93 /*
98 * don't overwite if can not get new one 94 * don't overwite if can not get new one
@@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
132 */ 128 */
133DEFINE_RAW_SPINLOCK(sparse_irq_lock); 129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 130
135struct irq_desc **irq_desc_ptrs __read_mostly; 131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
136 151
137static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
138 [0 ... NR_IRQS_LEGACY-1] = { 153 [0 ... NR_IRQS_LEGACY-1] = {
@@ -164,9 +179,6 @@ int __init early_irq_init(void)
164 legacy_count = ARRAY_SIZE(irq_desc_legacy); 179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
165 node = first_online_node; 180 node = first_online_node;
166 181
167 /* allocate irq_desc_ptrs array based on nr_irqs */
168 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
169
170 /* allocate based on nr_cpu_ids */ 182 /* allocate based on nr_cpu_ids */
171 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * 183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
172 sizeof(int), GFP_NOWAIT, node); 184 sizeof(int), GFP_NOWAIT, node);
@@ -180,23 +192,12 @@ int __init early_irq_init(void)
180 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
181 alloc_desc_masks(&desc[i], node, true); 193 alloc_desc_masks(&desc[i], node, true);
182 init_desc_masks(&desc[i]); 194 init_desc_masks(&desc[i]);
183 irq_desc_ptrs[i] = desc + i; 195 set_irq_desc(i, &desc[i]);
184 } 196 }
185 197
186 for (i = legacy_count; i < nr_irqs; i++)
187 irq_desc_ptrs[i] = NULL;
188
189 return arch_early_irq_init(); 198 return arch_early_irq_init();
190} 199}
191 200
192struct irq_desc *irq_to_desc(unsigned int irq)
193{
194 if (irq_desc_ptrs && irq < nr_irqs)
195 return irq_desc_ptrs[irq];
196
197 return NULL;
198}
199
200struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) 201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
201{ 202{
202 struct irq_desc *desc; 203 struct irq_desc *desc;
@@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
208 return NULL; 209 return NULL;
209 } 210 }
210 211
211 desc = irq_desc_ptrs[irq]; 212 desc = irq_to_desc(irq);
212 if (desc) 213 if (desc)
213 return desc; 214 return desc;
214 215
215 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 217
217 /* We have to check it to avoid races with another CPU */ 218 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 219 desc = irq_to_desc(irq);
219 if (desc) 220 if (desc)
220 goto out_unlock; 221 goto out_unlock;
221 222
222 if (slab_is_available()) 223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224 else
225 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
226 224
227 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); 225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
228 if (!desc) { 226 if (!desc) {
@@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
231 } 229 }
232 init_one_irq_desc(irq, desc, node); 230 init_one_irq_desc(irq, desc, node);
233 231
234 irq_desc_ptrs[irq] = desc; 232 set_irq_desc(irq, desc);
235 233
236out_unlock: 234out_unlock:
237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
@@ -372,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
372 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
373 unsigned int status = 0; 371 unsigned int status = 0;
374 372
375 if (!(action->flags & IRQF_DISABLED))
376 local_irq_enable_in_hardirq();
377
378 do { 373 do {
379 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
380 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
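
The fixed irq_desc_ptrs[] array becomes a radix tree, so sparse irq numbers no longer cost a pointer slot each. Reduced to its three calls, the generic pattern looks like the sketch below; my_tree and the helper names are placeholders, the radix-tree API is the real one.

#include <linux/radix-tree.h>

static RADIX_TREE(my_tree, GFP_ATOMIC);		/* empty tree, atomic allocations */

static int store_item(unsigned int index, void *item)
{
	return radix_tree_insert(&my_tree, index, item);
}

static void *find_item(unsigned int index)
{
	return radix_tree_lookup(&my_tree, index);	/* NULL if nothing stored */
}

static void swap_item(unsigned int index, void *new_item)
{
	void **slot = radix_tree_lookup_slot(&my_tree, index);

	if (slot)
		radix_tree_replace_slot(slot, new_item);
}
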
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b2821f070a3d..c63f3bc88f0b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
25extern struct irq_desc **irq_desc_ptrs;
26#else
27/* irq_desc_ptrs is a fixed size array */
28extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
29#endif 25#endif
30 26
31#ifdef CONFIG_PROC_FS 27#ifdef CONFIG_PROC_FS
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..e1497481fe8a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
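
irq_set_affinity_hint() lets a driver publish the CPU mask it would like the interrupt steered to; user space can read it back through the /proc file added later in this same series. A hedged driver-side sketch with illustrative names; note that the __free_irq() hunk further down WARNs if the hint is still set when the irq is freed.

#include <linux/interrupt.h>
#include <linux/cpumask.h>

static int my_dev_setup_irq(unsigned int irq, irq_handler_t handler,
			    const struct cpumask *preferred, void *dev)
{
	int ret;

	ret = request_irq(irq, handler, 0, "my-dev", dev);
	if (ret)
		return ret;

	irq_set_affinity_hint(irq, preferred);	/* advisory only; nothing is migrated */
	return 0;
}

static void my_dev_teardown_irq(unsigned int irq, void *dev)
{
	irq_set_affinity_hint(irq, NULL);	/* must be cleared before free_irq() */
	free_irq(irq, dev);
}
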
@@ -382,6 +398,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 398{
383 struct irq_desc *desc = irq_to_desc(irq); 399 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 400 struct irqaction *action;
401 unsigned long flags;
385 402
386 if (!desc) 403 if (!desc)
387 return 0; 404 return 0;
@@ -389,11 +406,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 406 if (desc->status & IRQ_NOREQUEST)
390 return 0; 407 return 0;
391 408
409 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 410 action = desc->action;
393 if (action) 411 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 412 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 413 action = NULL;
396 414
415 raw_spin_unlock_irqrestore(&desc->lock, flags);
416
397 return !action; 417 return !action;
398} 418}
399 419
@@ -436,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
437 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
438 desc->status |= flags; 458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
439 } 462 }
440 463
441 return ret; 464 return ret;
@@ -483,8 +506,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 506 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 508{
509again:
486 chip_bus_lock(irq, desc); 510 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 511 raw_spin_lock_irq(&desc->lock);
512
513 /*
514 * Implausible though it may be we need to protect us against
515 * the following scenario:
516 *
517 * The thread is faster done than the hard interrupt handler
518 * on the other CPU. If we unmask the irq line then the
519 * interrupt can come in again and masks the line, leaves due
520 * to IRQ_INPROGRESS and the irq line is masked forever.
521 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc);
525 cpu_relax();
526 goto again;
527 }
528
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 530 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 531 desc->chip->unmask(irq);
@@ -884,6 +925,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 925 desc->chip->disable(irq);
885 } 926 }
886 927
928#ifdef CONFIG_SMP
929 /* make sure affinity_hint is cleaned up */
930 if (WARN_ON_ONCE(desc->affinity_hint))
931 desc->affinity_hint = NULL;
932#endif
933
887 raw_spin_unlock_irqrestore(&desc->lock, flags); 934 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 935
889 unregister_handler_proc(irq, action); 936 unregister_handler_proc(irq, action);
@@ -995,7 +1042,6 @@ EXPORT_SYMBOL(free_irq);
995 * Flags: 1042 * Flags:
996 * 1043 *
997 * IRQF_SHARED Interrupt is shared 1044 * IRQF_SHARED Interrupt is shared
998 * IRQF_DISABLED Disable local interrupts while processing
999 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1045 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1000 * IRQF_TRIGGER_* Specify active edge(s) or level 1046 * IRQF_TRIGGER_* Specify active edge(s) or level
1001 * 1047 *
@@ -1009,25 +1055,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1009 int retval; 1055 int retval;
1010 1056
1011 /* 1057 /*
1012 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1013 * the _first_ irqaction (sigh). That can cause oopsing, but
1014 * the behavior is classified as "will not fix" so we need to
1015 * start nudging drivers away from using that idiom.
1016 */
1017 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1018 (IRQF_SHARED|IRQF_DISABLED)) {
1019 pr_warning(
1020 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1021 irq, devname);
1022 }
1023
1024#ifdef CONFIG_LOCKDEP
1025 /*
1026 * Lockdep wants atomic interrupt handlers:
1027 */
1028 irqflags |= IRQF_DISABLED;
1029#endif
1030 /*
1031 * Sanity-check: shared interrupts must pass in a real dev-ID, 1058 * Sanity-check: shared interrupts must pass in a real dev-ID,
1032 * otherwise we'll have trouble later trying to figure out 1059 * otherwise we'll have trouble later trying to figure out
1033 * which interrupt is which (messes up the interrupt freeing 1060 * which interrupt is which (messes up the interrupt freeing
@@ -1088,3 +1115,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1088 return retval; 1115 return retval;
1089} 1116}
1090EXPORT_SYMBOL(request_threaded_irq); 1117EXPORT_SYMBOL(request_threaded_irq);
1118
1119/**
1120 * request_any_context_irq - allocate an interrupt line
1121 * @irq: Interrupt line to allocate
1122 * @handler: Function to be called when the IRQ occurs.
1123 * Threaded handler for threaded interrupts.
1124 * @flags: Interrupt type flags
1125 * @name: An ascii name for the claiming device
1126 * @dev_id: A cookie passed back to the handler function
1127 *
1128 * This call allocates interrupt resources and enables the
1129 * interrupt line and IRQ handling. It selects either a
1130 * hardirq or threaded handling method depending on the
1131 * context.
1132 *
1133 * On failure, it returns a negative value. On success,
1134 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1135 */
1136int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1137 unsigned long flags, const char *name, void *dev_id)
1138{
1139 struct irq_desc *desc = irq_to_desc(irq);
1140 int ret;
1141
1142 if (!desc)
1143 return -EINVAL;
1144
1145 if (desc->status & IRQ_NESTED_THREAD) {
1146 ret = request_threaded_irq(irq, NULL, handler,
1147 flags, name, dev_id);
1148 return !ret ? IRQC_IS_NESTED : ret;
1149 }
1150
1151 ret = request_irq(irq, handler, flags, name, dev_id);
1152 return !ret ? IRQC_IS_HARDIRQ : ret;
1153}
1154EXPORT_SYMBOL_GPL(request_any_context_irq);
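
request_any_context_irq() picks threaded handling automatically when the parent irq chip is a nested (slow-bus) one, so a driver whose device may hang off either kind of controller needs only one call site. A sketch of such a caller; the device name, handler body and trigger flags are illustrative, IRQC_IS_HARDIRQ/IRQC_IS_NESTED are the documented return values.

#include <linux/kernel.h>
#include <linux/interrupt.h>

static irqreturn_t my_button_handler(int irq, void *dev_id)
{
	/* runs in hardirq or in a nested thread, depending on the parent chip */
	return IRQ_HANDLED;
}

static int my_button_request(unsigned int irq, void *dev)
{
	int ret;

	ret = request_any_context_irq(irq, my_button_handler,
				      IRQF_TRIGGER_FALLING, "my-button", dev);
	if (ret < 0)
		return ret;		/* request failed */

	if (ret == IRQC_IS_NESTED)
		pr_info("my-button: irq %u is handled in a nested thread\n", irq);

	return 0;
}
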
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 26bac9d8f860..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
@@ -70,7 +71,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 raw_spin_lock_irqsave(&sparse_irq_lock, flags); 71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 72
72 /* We have to check it to avoid races with another CPU */ 73 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 74 desc = irq_to_desc(irq);
74 75
75 if (desc && old_desc != desc) 76 if (desc && old_desc != desc)
76 goto out_unlock; 77 goto out_unlock;
@@ -90,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
90 goto out_unlock; 91 goto out_unlock;
91 } 92 }
92 93
93 irq_desc_ptrs[irq] = desc; 94 replace_irq_desc(irq, desc);
94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); 95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 96
96 /* free the old one */ 97 /* free the old one */
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
@@ -31,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
31 return 0; 32 return 0;
32} 33}
33 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
34#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
35#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
36#endif 58#endif
@@ -83,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
83 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
84} 106}
85 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
86static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
87 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
88 .read = seq_read, 115 .read = seq_read,
@@ -91,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
91 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
92}; 119};
93 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
94static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
95{ 129{
96 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -146,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
146 .release = single_release, 180 .release = single_release,
147 .write = default_affinity_write, 181 .write = default_affinity_write,
148}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
149#endif 203#endif
150 204
151static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -230,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
230 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
231 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
232 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
233#endif 294#endif
234 295
235 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
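
From user space the two new files simply read back as cpumask and node-number text. A small, purely illustrative userspace C snippet that dumps them:

#include <stdio.h>

static void dump_irq_info(int irq)
{
	const char *names[] = { "affinity_hint", "node" };
	char path[64], line[256];

	for (int i = 0; i < 2; i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/irq/%d/%s", irq, names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* file absent on older kernels or !SMP builds */
		if (fgets(line, sizeof(line), f))
			printf("irq %d %s: %s", irq, names[i], line);
		fclose(f);
	}
}
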
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,11 +16,13 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kdb.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 22#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/slab.h>
24 26
25#include <asm/sections.h> 27#include <asm/sections.h>
26 28
@@ -515,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
515 return ret; 517 return ret;
516} 518}
517 519
520#ifdef CONFIG_KGDB_KDB
521const char *kdb_walk_kallsyms(loff_t *pos)
522{
523 static struct kallsym_iter kdb_walk_kallsyms_iter;
524 if (*pos == 0) {
525 memset(&kdb_walk_kallsyms_iter, 0,
526 sizeof(kdb_walk_kallsyms_iter));
527 reset_iter(&kdb_walk_kallsyms_iter, 0);
528 }
529 while (1) {
530 if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
531 return NULL;
532 ++*pos;
533 /* Some debugging symbols have no name. Ignore them. */
534 if (kdb_walk_kallsyms_iter.name[0])
535 return kdb_walk_kallsyms_iter.name;
536 }
537}
538#endif /* CONFIG_KGDB_KDB */
539
518static const struct file_operations kallsyms_operations = { 540static const struct file_operations kallsyms_operations = {
519 .open = kallsyms_open, 541 .open = kallsyms_open,
520 .read = seq_read, 542 .read = seq_read,
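
kdb_walk_kallsyms() is a cursor-style iterator: start with *pos == 0 and keep calling until it returns NULL, getting one non-empty symbol name per call. A sketch of a consumer; the function lives behind kdb's private header, so the extern declaration below stands in for its real prototype.

#include <linux/types.h>

/* Normally provided by kdb's private header; repeated here only for the sketch. */
extern const char *kdb_walk_kallsyms(loff_t *pos);

static unsigned long count_kernel_symbols(void)
{
	unsigned long count = 0;
	loff_t pos = 0;

	while (kdb_walk_kallsyms(&pos))
		count++;

	return count;
}
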
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ef077fb73155..131b1703936f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -41,7 +41,7 @@
41#include <asm/sections.h> 41#include <asm/sections.h>
42 42
43/* Per cpu memory for storing cpu states in case of system crash. */ 43/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t* crash_notes; 44note_buf_t __percpu *crash_notes;
45 45
46/* vmcoreinfo stuff */ 46/* vmcoreinfo stuff */
47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; 47static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
1089 1089
1090size_t crash_get_memory_size(void) 1090size_t crash_get_memory_size(void)
1091{ 1091{
1092 size_t size; 1092 size_t size = 0;
1093 mutex_lock(&kexec_mutex); 1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1; 1094 if (crashk_res.end != crashk_res.start)
1095 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex); 1096 mutex_unlock(&kexec_mutex);
1096 return size; 1097 return size;
1097} 1098}
@@ -1134,11 +1135,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1135
1135 free_reserved_phys_range(end, crashk_res.end); 1136 free_reserved_phys_range(end, crashk_res.end);
1136 1137
1137 if (start == end) { 1138 if ((start == end) && (crashk_res.parent != NULL))
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1139 release_resource(&crashk_res);
1140 } else 1140 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1141
1143unlock: 1142unlock:
1144 mutex_unlock(&kexec_mutex); 1143 mutex_unlock(&kexec_mutex);
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 32c5c15d750d..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
@@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n);
349 * @fifo: the fifo to be used. 350 * @fifo: the fifo to be used.
350 * @from: pointer to the data to be added. 351 * @from: pointer to the data to be added.
351 * @len: the length of the data to be added. 352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
352 * 354 *
353 * This function copies at most @len bytes from the @from into the 355 * This function copies at most @len bytes from the @from into the
354 * FIFO depending and returns -EFAULT/0. 356 * FIFO depending and returns -EFAULT/0.
@@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n);
399 * @fifo: the fifo to be used. 401 * @fifo: the fifo to be used.
400 * @to: where the data must be copied. 402 * @to: where the data must be copied.
401 * @len: the size of the destination buffer. 403 * @len: the size of the destination buffer.
402 @ @lenout: pointer to output variable with copied data 404 * @lenout: pointer to output variable with copied data
403 * 405 *
404 * This function copies at most @len bytes from the FIFO into the 406 * This function copies at most @len bytes from the FIFO into the
405 * @to buffer and 0 or -EFAULT. 407 * @to buffer and 0 or -EFAULT.
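
The kfifo_free() fix above leaves a freed fifo in the same zeroed state as one whose allocation failed, so a stale buffer pointer cannot be reused by accident. A short lifecycle sketch with this era's struct-based API; the 128-byte size and the payload are arbitrary.

#include <linux/kfifo.h>
#include <linux/slab.h>

static int kfifo_demo(void)
{
	struct kfifo fifo;
	unsigned char out[4];
	int ret;

	ret = kfifo_alloc(&fifo, 128, GFP_KERNEL);	/* size is rounded up to a power of two */
	if (ret)
		return ret;	/* on failure the fifo is left as (NULL, 0) */

	kfifo_in(&fifo, "abcd", 4);			/* returns bytes actually copied in */
	kfifo_out(&fifo, out, sizeof(out));		/* returns bytes actually copied out */

	kfifo_free(&fifo);	/* now also re-inits the fifo to (NULL, 0) */
	return 0;
}
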
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 2eb517e23514..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1760 +0,0 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55#include <asm/unaligned.h>
56
57static int kgdb_break_asap;
58
59#define KGDB_MAX_THREAD_QUERY 17
60struct kgdb_state {
61 int ex_vector;
62 int signo;
63 int err_code;
64 int cpu;
65 int pass_exception;
66 unsigned long thr_query;
67 unsigned long threadid;
68 long kgdb_usethreadid;
69 struct pt_regs *linux_regs;
70};
71
72static struct debuggerinfo_struct {
73 void *debuggerinfo;
74 struct task_struct *task;
75} kgdb_info[NR_CPUS];
76
77/**
78 * kgdb_connected - Is a host GDB connected to us?
79 */
80int kgdb_connected;
81EXPORT_SYMBOL_GPL(kgdb_connected);
82
83/* All the KGDB handlers are installed */
84static int kgdb_io_module_registered;
85
86/* Guard for recursive entry */
87static int exception_level;
88
89static struct kgdb_io *kgdb_io_ops;
90static DEFINE_SPINLOCK(kgdb_registration_lock);
91
92/* kgdb console driver is loaded */
93static int kgdb_con_registered;
94/* determine if kgdb console output should be used */
95static int kgdb_use_con;
96
97static int __init opt_kgdb_con(char *str)
98{
99 kgdb_use_con = 1;
100 return 0;
101}
102
103early_param("kgdbcon", opt_kgdb_con);
104
105module_param(kgdb_use_con, int, 0644);
106
107/*
108 * Holds information about breakpoints in a kernel. These breakpoints are
109 * added and removed by gdb.
110 */
111static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
112 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
113};
114
115/*
116 * The CPU# of the active CPU, or -1 if none:
117 */
118atomic_t kgdb_active = ATOMIC_INIT(-1);
119
120/*
121 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
122 * bootup code (which might not have percpu set up yet):
123 */
124static atomic_t passive_cpu_wait[NR_CPUS];
125static atomic_t cpu_in_kgdb[NR_CPUS];
126atomic_t kgdb_setting_breakpoint;
127
128struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread;
130
131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
133
134/* Our I/O buffers. */
135static char remcom_in_buffer[BUFMAX];
136static char remcom_out_buffer[BUFMAX];
137
138/* Storage for the registers, in GDB format. */
139static unsigned long gdb_regs[(NUMREGBYTES +
140 sizeof(unsigned long) - 1) /
141 sizeof(unsigned long)];
142
143/* to keep track of the CPU which is doing the single stepping*/
144atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
145
146/*
147 * If you are debugging a problem where roundup (the collection of
148 * all other CPUs) itself causes trouble [this should be extremely rare],
149 * then use the nokgdbroundup option to avoid roundup. In that case
150 * the other CPUs might interfere with your debugging context, so
151 * use this with care:
152 */
153static int kgdb_do_roundup = 1;
154
155static int __init opt_nokgdbroundup(char *str)
156{
157 kgdb_do_roundup = 0;
158
159 return 0;
160}
161
162early_param("nokgdbroundup", opt_nokgdbroundup);
163
164/*
165 * Finally, some KGDB code :-)
166 */
167
168/*
169 * Weak aliases for breakpoint management,
170 * can be overridden by architectures when needed:
171 */
172int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
173{
174 int err;
175
176 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
177 if (err)
178 return err;
179
180 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
181 BREAK_INSTR_SIZE);
182}
183
184int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
185{
186 return probe_kernel_write((char *)addr,
187 (char *)bundle, BREAK_INSTR_SIZE);
188}
189
190int __weak kgdb_validate_break_address(unsigned long addr)
191{
192 char tmp_variable[BREAK_INSTR_SIZE];
193 int err;
194 /* Validate setting the breakpoint and then removing it. If the
195 * remove fails, the kernel needs to emit a bad message because we
196 * are in deep trouble, not being able to put things back the way we
197 * found them.
198 */
199 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
200 if (err)
201 return err;
202 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
203 if (err)
204 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
205 "memory destroyed at: %lx", addr);
206 return err;
207}
208
209unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
210{
211 return instruction_pointer(regs);
212}
213
214int __weak kgdb_arch_init(void)
215{
216 return 0;
217}
218
219int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
220{
221 return 0;
222}
223
224void __weak
225kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
226{
227 return;
228}
229
230/**
231 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
232 * @regs: Current &struct pt_regs.
233 *
234 * This function will be called if the particular architecture must
235 * disable hardware debugging while it is processing gdb packets or
236 * handling an exception.
237 */
238void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
239{
240}
241
242/*
243 * GDB remote protocol parser:
244 */
245
246static int hex(char ch)
247{
248 if ((ch >= 'a') && (ch <= 'f'))
249 return ch - 'a' + 10;
250 if ((ch >= '0') && (ch <= '9'))
251 return ch - '0';
252 if ((ch >= 'A') && (ch <= 'F'))
253 return ch - 'A' + 10;
254 return -1;
255}
256
257/* scan for the sequence $<data>#<checksum> */
258static void get_packet(char *buffer)
259{
260 unsigned char checksum;
261 unsigned char xmitcsum;
262 int count;
263 char ch;
264
265 do {
266 /*
267 * Spin and wait around for the start character, ignore all
268 * other characters:
269 */
270 while ((ch = (kgdb_io_ops->read_char())) != '$')
271 /* nothing */;
272
273 kgdb_connected = 1;
274 checksum = 0;
275 xmitcsum = -1;
276
277 count = 0;
278
279 /*
280 * now, read until a # or end of buffer is found:
281 */
282 while (count < (BUFMAX - 1)) {
283 ch = kgdb_io_ops->read_char();
284 if (ch == '#')
285 break;
286 checksum = checksum + ch;
287 buffer[count] = ch;
288 count = count + 1;
289 }
290 buffer[count] = 0;
291
292 if (ch == '#') {
293 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
294 xmitcsum += hex(kgdb_io_ops->read_char());
295
296 if (checksum != xmitcsum)
297 /* failed checksum */
298 kgdb_io_ops->write_char('-');
299 else
300 /* successful transfer */
301 kgdb_io_ops->write_char('+');
302 if (kgdb_io_ops->flush)
303 kgdb_io_ops->flush();
304 }
305 } while (checksum != xmitcsum);
306}
307
308/*
309 * Send the packet in buffer.
310 * Check for gdb connection if asked for.
311 */
312static void put_packet(char *buffer)
313{
314 unsigned char checksum;
315 int count;
316 char ch;
317
318 /*
319 * $<packet info>#<checksum>.
320 */
321 while (1) {
322 kgdb_io_ops->write_char('$');
323 checksum = 0;
324 count = 0;
325
326 while ((ch = buffer[count])) {
327 kgdb_io_ops->write_char(ch);
328 checksum += ch;
329 count++;
330 }
331
332 kgdb_io_ops->write_char('#');
333 kgdb_io_ops->write_char(hex_asc_hi(checksum));
334 kgdb_io_ops->write_char(hex_asc_lo(checksum));
335 if (kgdb_io_ops->flush)
336 kgdb_io_ops->flush();
337
338 /* Now see what we get in reply. */
339 ch = kgdb_io_ops->read_char();
340
341 if (ch == 3)
342 ch = kgdb_io_ops->read_char();
343
344 /* If we get an ACK, we are done. */
345 if (ch == '+')
346 return;
347
348 /*
349 * If we get the start of another packet, this means
350 * that GDB is attempting to reconnect. We will NAK
351 * the packet being sent, and stop trying to send this
352 * packet.
353 */
354 if (ch == '$') {
355 kgdb_io_ops->write_char('-');
356 if (kgdb_io_ops->flush)
357 kgdb_io_ops->flush();
358 return;
359 }
360 }
361}
362
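
get_packet()/put_packet() above implement the framing layer of the GDB remote serial protocol: every payload travels as $<data>#<checksum>, the checksum is the modulo-256 sum of the payload bytes written as two lowercase hex digits, and the receiver answers '+' (ACK) or '-' (NAK, resend). A minimal user-space sketch of the sender side (illustrative only; the example payloads are arbitrary):

#include <stdio.h>
#include <string.h>

/* Frame a GDB remote-protocol payload as $<data>#<checksum>. */
static void put_gdb_packet(const char *payload, char *out, size_t outlen)
{
	unsigned char csum = 0;
	size_t i;

	for (i = 0; payload[i]; i++)
		csum += (unsigned char)payload[i];

	snprintf(out, outlen, "$%s#%02x", payload, csum);
}

int main(void)
{
	char frame[128];

	/* "g" asks the stub for all CPU registers. */
	put_gdb_packet("g", frame, sizeof(frame));
	printf("%s\n", frame);		/* prints: $g#67 */

	/* set a software breakpoint at an (example) address */
	put_gdb_packet("Z0,c01a2b3c,1", frame, sizeof(frame));
	printf("%s\n", frame);
	return 0;
}

The error replies ("Exx") and console output ("O...") produced further down in this file ride inside exactly the same envelope.
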
363/*
364 * Convert the memory pointed to by mem into hex, placing result in buf.
365 * Returns 0 on success, or the error returned by probe_kernel_read().
366 */
367int kgdb_mem2hex(char *mem, char *buf, int count)
368{
369 char *tmp;
370 int err;
371
372 /*
373 * We use the upper half of buf as an intermediate buffer for the
374 * raw memory copy. Hex conversion will work against this one.
375 */
376 tmp = buf + count;
377
378 err = probe_kernel_read(tmp, mem, count);
379 if (!err) {
380 while (count > 0) {
381 buf = pack_hex_byte(buf, *tmp);
382 tmp++;
383 count--;
384 }
385
386 *buf = 0;
387 }
388
389 return err;
390}
391
392/*
393 * Copy the binary array pointed to by buf into mem, un-escaping bytes
394 * that were escaped with 0x7d ($, # and 0x7d itself are sent as 0x7d
395 * followed by the byte XORed with 0x20). Returns 0 or an error.
396 */
397static int kgdb_ebin2mem(char *buf, char *mem, int count)
398{
399 int err = 0;
400 char c;
401
402 while (count-- > 0) {
403 c = *buf++;
404 if (c == 0x7d)
405 c = *buf++ ^ 0x20;
406
407 err = probe_kernel_write(mem, &c, 1);
408 if (err)
409 break;
410
411 mem++;
412 }
413
414 return err;
415}
416
417/*
418 * Convert the hex array pointed to by buf into binary to be placed in mem.
419 * Returns 0 on success, or the error returned by
420 * probe_kernel_write().
421 */
422int kgdb_hex2mem(char *buf, char *mem, int count)
423{
424 char *tmp_raw;
425 char *tmp_hex;
426
427 /*
428 * We use the upper half of buf as an intermediate buffer for the
429 * raw memory that is converted from hex.
430 */
431 tmp_raw = buf + count * 2;
432
433 tmp_hex = tmp_raw - 1;
434 while (tmp_hex >= buf) {
435 tmp_raw--;
436 *tmp_raw = hex(*tmp_hex--);
437 *tmp_raw |= hex(*tmp_hex--) << 4;
438 }
439
440 return probe_kernel_write(mem, tmp_raw, count);
441}
442
443/*
444 * While we find nice hex chars, build a long_val.
445 * Return number of chars processed.
446 */
447int kgdb_hex2long(char **ptr, unsigned long *long_val)
448{
449 int hex_val;
450 int num = 0;
451 int negate = 0;
452
453 *long_val = 0;
454
455 if (**ptr == '-') {
456 negate = 1;
457 (*ptr)++;
458 }
459 while (**ptr) {
460 hex_val = hex(**ptr);
461 if (hex_val < 0)
462 break;
463
464 *long_val = (*long_val << 4) | hex_val;
465 num++;
466 (*ptr)++;
467 }
468
469 if (negate)
470 *long_val = -*long_val;
471
472 return num;
473}
474
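
kgdb_mem2hex(), kgdb_ebin2mem(), kgdb_hex2mem() and kgdb_hex2long() are the stub's only data encodings: memory crosses the wire as plain hex text, and addresses/lengths are parsed with the same hex() digit helper. A stand-alone user-space round trip of the hex encoding (no probe_kernel_read/write protection here, so this is a sketch, not the kernel routine):

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

static int hex(char ch)
{
	if (ch >= 'a' && ch <= 'f')
		return ch - 'a' + 10;
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'A' && ch <= 'F')
		return ch - 'A' + 10;
	return -1;
}

/* Encode count raw bytes as 2*count hex characters. */
static void mem2hex(const unsigned char *mem, char *buf, int count)
{
	while (count-- > 0) {
		*buf++ = hex_asc[*mem >> 4];
		*buf++ = hex_asc[*mem & 0x0f];
		mem++;
	}
	*buf = '\0';
}

/* Decode 2*count hex characters back into count raw bytes. */
static void hex2mem(const char *buf, unsigned char *mem, int count)
{
	while (count-- > 0) {
		int hi = hex(*buf++);
		int lo = hex(*buf++);

		*mem++ = (unsigned char)((hi << 4) | lo);
	}
}

int main(void)
{
	unsigned char raw[4] = { 0xde, 0xad, 0xbe, 0xef };
	unsigned char back[4];
	char encoded[2 * sizeof(raw) + 1];

	mem2hex(raw, encoded, sizeof(raw));
	hex2mem(encoded, back, sizeof(back));
	printf("%s -> %02x%02x%02x%02x\n",
	       encoded, back[0], back[1], back[2], back[3]);
	return 0;
}
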
475/* Write memory due to an 'M' or 'X' packet. */
476static int write_mem_msg(int binary)
477{
478 char *ptr = &remcom_in_buffer[1];
479 unsigned long addr;
480 unsigned long length;
481 int err;
482
483 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
484 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
485 if (binary)
486 err = kgdb_ebin2mem(ptr, (char *)addr, length);
487 else
488 err = kgdb_hex2mem(ptr, (char *)addr, length);
489 if (err)
490 return err;
491 if (CACHE_FLUSH_IS_SAFE)
492 flush_icache_range(addr, addr + length);
493 return 0;
494 }
495
496 return -EINVAL;
497}
498
499static void error_packet(char *pkt, int error)
500{
501 error = -error;
502 pkt[0] = 'E';
503 pkt[1] = hex_asc[(error / 10)];
504 pkt[2] = hex_asc[(error % 10)];
505 pkt[3] = '\0';
506}
507
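
error_packet() turns a negative errno into the three-character reply "E" plus two decimal digits; because the values stay below 100, indexing hex_asc with error/10 and error%10 yields ordinary decimal digits, so -EINVAL (-22) goes out as "E22". The same encoding in a tiny user-space sketch (using '0' + digit, which is equivalent for digits 0-9):

#include <stdio.h>

/* Mirror of the stub's error reply: "E" followed by two decimal digits. */
static void error_packet(char *pkt, int error)
{
	error = -error;			/* errno values arrive negated */
	pkt[0] = 'E';
	pkt[1] = (char)('0' + error / 10);
	pkt[2] = (char)('0' + error % 10);
	pkt[3] = '\0';
}

int main(void)
{
	char pkt[4];

	error_packet(pkt, -22);		/* -EINVAL */
	printf("%s\n", pkt);		/* prints: E22 */
	return 0;
}
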
508/*
509 * Thread ID accessors. We represent a flat TID space to GDB, where
510 * the per CPU idle threads (which under Linux all have PID 0) are
511 * remapped to negative TIDs.
512 */
513
514#define BUF_THREAD_ID_SIZE 16
515
516static char *pack_threadid(char *pkt, unsigned char *id)
517{
518 char *limit;
519
520 limit = pkt + BUF_THREAD_ID_SIZE;
521 while (pkt < limit)
522 pkt = pack_hex_byte(pkt, *id++);
523
524 return pkt;
525}
526
527static void int_to_threadref(unsigned char *id, int value)
528{
529 unsigned char *scan;
530 int i = 4;
531
532 scan = (unsigned char *)id;
533 while (i--)
534 *scan++ = 0;
535 put_unaligned_be32(value, scan);
536}
537
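
int_to_threadref() builds the 8-byte thread reference GDB expects: four zero bytes followed by the id stored big-endian in the last four, and pack_threadid() hex-encodes those 8 bytes into BUF_THREAD_ID_SIZE (16) characters. Per the remapping described above, ordinary tasks use their PID and the per-CPU idle/shadow threads use -cpu - 2. A user-space sketch of the layout (plain shifts stand in for put_unaligned_be32()):

#include <stdio.h>

#define BUF_THREAD_ID_SIZE 16		/* 8 bytes -> 16 hex characters */

/* 8-byte thread reference: 4 zero bytes + 32-bit big-endian id. */
static void int_to_threadref(unsigned char *id, int value)
{
	unsigned int v = (unsigned int)value;

	id[0] = id[1] = id[2] = id[3] = 0;
	id[4] = (unsigned char)(v >> 24);
	id[5] = (unsigned char)(v >> 16);
	id[6] = (unsigned char)(v >> 8);
	id[7] = (unsigned char)v;
}

static void pack_threadid(char *pkt, const unsigned char *id)
{
	static const char hex_asc[] = "0123456789abcdef";
	int i;

	for (i = 0; i < BUF_THREAD_ID_SIZE / 2; i++) {
		*pkt++ = hex_asc[id[i] >> 4];
		*pkt++ = hex_asc[id[i] & 0x0f];
	}
	*pkt = '\0';
}

int main(void)
{
	unsigned char thref[8];
	char out[BUF_THREAD_ID_SIZE + 1];

	int_to_threadref(thref, 1234);	/* an ordinary PID */
	pack_threadid(out, thref);
	printf("pid 1234     -> %s\n", out);	/* 00000000000004d2 */

	int_to_threadref(thref, -2);	/* shadow thread for CPU 0 (-cpu - 2) */
	pack_threadid(out, thref);
	printf("CPU 0 shadow -> %s\n", out);	/* 00000000fffffffe */
	return 0;
}
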
538static struct task_struct *getthread(struct pt_regs *regs, int tid)
539{
540 /*
541 * Non-positive TIDs are remapped to the cpu shadow information
542 */
543 if (tid == 0 || tid == -1)
544 tid = -atomic_read(&kgdb_active) - 2;
545 if (tid < -1 && tid > -NR_CPUS - 2) {
546 if (kgdb_info[-tid - 2].task)
547 return kgdb_info[-tid - 2].task;
548 else
549 return idle_task(-tid - 2);
550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
556
557 /*
558 * find_task_by_pid_ns() does not take the tasklist lock anymore
559 * but is nicely RCU locked - hence is a pretty resilient
560 * thing to use:
561 */
562 return find_task_by_pid_ns(tid, &init_pid_ns);
563}
564
565/*
566 * CPU debug state control:
567 */
568
569#ifdef CONFIG_SMP
570static void kgdb_wait(struct pt_regs *regs)
571{
572 unsigned long flags;
573 int cpu;
574
575 local_irq_save(flags);
576 cpu = raw_smp_processor_id();
577 kgdb_info[cpu].debuggerinfo = regs;
578 kgdb_info[cpu].task = current;
579 /*
580 * Make sure the above info reaches the primary CPU before
581 * our cpu_in_kgdb[] flag setting does:
582 */
583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1);
585
586 /* Wait till primary CPU is done with debugging */
587 while (atomic_read(&passive_cpu_wait[cpu]))
588 cpu_relax();
589
590 kgdb_info[cpu].debuggerinfo = NULL;
591 kgdb_info[cpu].task = NULL;
592
593 /* fix up hardware debug registers on local cpu */
594 if (arch_kgdb_ops.correct_hw_break)
595 arch_kgdb_ops.correct_hw_break();
596
597 /* Signal the primary CPU that we are done: */
598 atomic_set(&cpu_in_kgdb[cpu], 0);
599 touch_softlockup_watchdog();
600 clocksource_touch_watchdog();
601 local_irq_restore(flags);
602}
603#endif
604
605/*
606 * Some architectures need cache flushes when we set/clear a
607 * breakpoint:
608 */
609static void kgdb_flush_swbreak_addr(unsigned long addr)
610{
611 if (!CACHE_FLUSH_IS_SAFE)
612 return;
613
614 if (current->mm && current->mm->mmap_cache) {
615 flush_cache_range(current->mm->mmap_cache,
616 addr, addr + BREAK_INSTR_SIZE);
617 }
618 /* Force flush instruction cache if it was outside the mm */
619 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
620}
621
622/*
623 * SW breakpoint management:
624 */
625static int kgdb_activate_sw_breakpoints(void)
626{
627 unsigned long addr;
628 int error;
629 int ret = 0;
630 int i;
631
632 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
633 if (kgdb_break[i].state != BP_SET)
634 continue;
635
636 addr = kgdb_break[i].bpt_addr;
637 error = kgdb_arch_set_breakpoint(addr,
638 kgdb_break[i].saved_instr);
639 if (error) {
640 ret = error;
641 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
642 continue;
643 }
644
645 kgdb_flush_swbreak_addr(addr);
646 kgdb_break[i].state = BP_ACTIVE;
647 }
648 return ret;
649}
650
651static int kgdb_set_sw_break(unsigned long addr)
652{
653 int err = kgdb_validate_break_address(addr);
654 int breakno = -1;
655 int i;
656
657 if (err)
658 return err;
659
660 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
661 if ((kgdb_break[i].state == BP_SET) &&
662 (kgdb_break[i].bpt_addr == addr))
663 return -EEXIST;
664 }
665 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
666 if (kgdb_break[i].state == BP_REMOVED &&
667 kgdb_break[i].bpt_addr == addr) {
668 breakno = i;
669 break;
670 }
671 }
672
673 if (breakno == -1) {
674 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
675 if (kgdb_break[i].state == BP_UNDEFINED) {
676 breakno = i;
677 break;
678 }
679 }
680 }
681
682 if (breakno == -1)
683 return -E2BIG;
684
685 kgdb_break[breakno].state = BP_SET;
686 kgdb_break[breakno].type = BP_BREAKPOINT;
687 kgdb_break[breakno].bpt_addr = addr;
688
689 return 0;
690}
691
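
kgdb_set_sw_break() treats kgdb_break[] as a small slot allocator: an address already in BP_SET state is rejected with -EEXIST, a BP_REMOVED slot for the same address is reused, otherwise the first BP_UNDEFINED slot is taken, and -E2BIG is returned when the table is full; activation later patches the instruction and moves the slot to BP_ACTIVE. A user-space sketch of just the slot-selection logic (the enum values below are local to the sketch, not the kernel's definitions, and no instruction patching is done):

#include <stdio.h>

#define MAX_BREAKPOINTS 4

enum bp_state { BP_UNDEFINED = 0, BP_REMOVED, BP_SET, BP_ACTIVE };

struct bkpt {
	enum bp_state state;
	unsigned long addr;
};

static struct bkpt bp[MAX_BREAKPOINTS];

/* Pick a slot for addr: reuse a BP_REMOVED slot for the same address,
 * otherwise take the first BP_UNDEFINED one. */
static int set_sw_break(unsigned long addr)
{
	int slot = -1;
	int i;

	for (i = 0; i < MAX_BREAKPOINTS; i++) {
		if (bp[i].state == BP_SET && bp[i].addr == addr)
			return -1;		/* -EEXIST in the kernel */
	}
	for (i = 0; i < MAX_BREAKPOINTS; i++) {
		if (bp[i].state == BP_REMOVED && bp[i].addr == addr) {
			slot = i;
			break;
		}
	}
	if (slot == -1) {
		for (i = 0; i < MAX_BREAKPOINTS; i++) {
			if (bp[i].state == BP_UNDEFINED) {
				slot = i;
				break;
			}
		}
	}
	if (slot == -1)
		return -2;			/* -E2BIG: table is full */

	bp[slot].state = BP_SET;
	bp[slot].addr = addr;
	return slot;
}

int main(void)
{
	printf("addr 0x1000 -> %d\n", set_sw_break(0x1000));	/* slot 0 */
	printf("addr 0x1000 -> %d\n", set_sw_break(0x1000));	/* duplicate */
	printf("addr 0x2000 -> %d\n", set_sw_break(0x2000));	/* slot 1 */
	return 0;
}
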
692static int kgdb_deactivate_sw_breakpoints(void)
693{
694 unsigned long addr;
695 int error;
696 int ret = 0;
697 int i;
698
699 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
700 if (kgdb_break[i].state != BP_ACTIVE)
701 continue;
702 addr = kgdb_break[i].bpt_addr;
703 error = kgdb_arch_remove_breakpoint(addr,
704 kgdb_break[i].saved_instr);
705 if (error) {
706 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
707 ret = error;
708 }
709
710 kgdb_flush_swbreak_addr(addr);
711 kgdb_break[i].state = BP_SET;
712 }
713 return ret;
714}
715
716static int kgdb_remove_sw_break(unsigned long addr)
717{
718 int i;
719
720 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
721 if ((kgdb_break[i].state == BP_SET) &&
722 (kgdb_break[i].bpt_addr == addr)) {
723 kgdb_break[i].state = BP_REMOVED;
724 return 0;
725 }
726 }
727 return -ENOENT;
728}
729
730int kgdb_isremovedbreak(unsigned long addr)
731{
732 int i;
733
734 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
735 if ((kgdb_break[i].state == BP_REMOVED) &&
736 (kgdb_break[i].bpt_addr == addr))
737 return 1;
738 }
739 return 0;
740}
741
742static int remove_all_break(void)
743{
744 unsigned long addr;
745 int error;
746 int i;
747
748 /* Clear memory breakpoints. */
749 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
750 if (kgdb_break[i].state != BP_ACTIVE)
751 goto setundefined;
752 addr = kgdb_break[i].bpt_addr;
753 error = kgdb_arch_remove_breakpoint(addr,
754 kgdb_break[i].saved_instr);
755 if (error)
756 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
757 addr);
758setundefined:
759 kgdb_break[i].state = BP_UNDEFINED;
760 }
761
762 /* Clear hardware breakpoints. */
763 if (arch_kgdb_ops.remove_all_hw_break)
764 arch_kgdb_ops.remove_all_hw_break();
765
766 return 0;
767}
768
769/*
770 * Remap normal tasks to their real PID,
771 * CPU shadow threads are mapped to -CPU - 2
772 */
773static inline int shadow_pid(int realpid)
774{
775 if (realpid)
776 return realpid;
777
778 return -raw_smp_processor_id() - 2;
779}
780
781static char gdbmsgbuf[BUFMAX + 1];
782
783static void kgdb_msg_write(const char *s, int len)
784{
785 char *bufptr;
786 int wcount;
787 int i;
788
789 /* 'O'utput */
790 gdbmsgbuf[0] = 'O';
791
792 /* Fill and send buffers... */
793 while (len > 0) {
794 bufptr = gdbmsgbuf + 1;
795
796 /* Calculate how many this time */
797 if ((len << 1) > (BUFMAX - 2))
798 wcount = (BUFMAX - 2) >> 1;
799 else
800 wcount = len;
801
802 /* Pack in hex chars */
803 for (i = 0; i < wcount; i++)
804 bufptr = pack_hex_byte(bufptr, s[i]);
805 *bufptr = '\0';
806
807 /* Move up */
808 s += wcount;
809 len -= wcount;
810
811 /* Write packet */
812 put_packet(gdbmsgbuf);
813 }
814}
815
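
kgdb_msg_write() is how console output reaches gdb while the target is stopped: the text is split into chunks that fit BUFMAX, hex-encoded, and sent as 'O' packets, which gdb prints on its own terminal. A user-space sketch of what one small chunk looks like before the $...#csum envelope is added:

#include <stdio.h>

/* Hex-encode a console message into a GDB 'O' (output) packet body. */
static void make_o_packet(const char *msg, char *out)
{
	static const char hex_asc[] = "0123456789abcdef";

	*out++ = 'O';
	for (; *msg; msg++) {
		*out++ = hex_asc[(unsigned char)*msg >> 4];
		*out++ = hex_asc[(unsigned char)*msg & 0x0f];
	}
	*out = '\0';
}

int main(void)
{
	char pkt[256];

	make_o_packet("hi\n", pkt);
	printf("%s\n", pkt);		/* prints: O68690a */
	return 0;
}
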
816/*
817 * Return true if there is a valid kgdb I/O module. Also if no
818 * debugger is attached a message can be printed to the console about
819 * waiting for the debugger to attach.
820 *
821 * The print_wait argument is only to be true when called from inside
822 * the core kgdb_handle_exception, because it will wait for the
823 * debugger to attach.
824 */
825static int kgdb_io_ready(int print_wait)
826{
827 if (!kgdb_io_ops)
828 return 0;
829 if (kgdb_connected)
830 return 1;
831 if (atomic_read(&kgdb_setting_breakpoint))
832 return 1;
833 if (print_wait)
834 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
835 return 1;
836}
837
838/*
839 * All the functions that start with gdb_cmd are the various
840 * operations to implement the handlers for the gdbserial protocol
841 * where KGDB is communicating with an external debugger
842 */
843
844/* Handle the '?' status packets */
845static void gdb_cmd_status(struct kgdb_state *ks)
846{
847 /*
848 * We know that this packet is only sent
849 * during initial connect. So to be safe,
850 * we clear out our breakpoints now in case
851 * GDB is reconnecting.
852 */
853 remove_all_break();
854
855 remcom_out_buffer[0] = 'S';
856 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
857}
858
859/* Handle the 'g' get registers request */
860static void gdb_cmd_getregs(struct kgdb_state *ks)
861{
862 struct task_struct *thread;
863 void *local_debuggerinfo;
864 int i;
865
866 thread = kgdb_usethread;
867 if (!thread) {
868 thread = kgdb_info[ks->cpu].task;
869 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
870 } else {
871 local_debuggerinfo = NULL;
872 for_each_online_cpu(i) {
873 /*
874 * Try to find the task on some other
875 * node, or possibly this one; if we do
876 * not find a matching task then we
877 * approximate the results.
878 */
879 if (thread == kgdb_info[i].task)
880 local_debuggerinfo = kgdb_info[i].debuggerinfo;
881 }
882 }
883
884 /*
885 * All threads that don't have debuggerinfo should be
886 * in schedule() sleeping, since all other CPUs
887 * are in kgdb_wait, and thus have debuggerinfo.
888 */
889 if (local_debuggerinfo) {
890 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
891 } else {
892 /*
893 * Pull stuff saved during switch_to; nothing
894 * else is accessible (or even particularly
895 * relevant).
896 *
897 * This should be enough for a stack trace.
898 */
899 sleeping_thread_to_gdb_regs(gdb_regs, thread);
900 }
901 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
902}
903
904/* Handle the 'G' set registers request */
905static void gdb_cmd_setregs(struct kgdb_state *ks)
906{
907 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
908
909 if (kgdb_usethread && kgdb_usethread != current) {
910 error_packet(remcom_out_buffer, -EINVAL);
911 } else {
912 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
913 strcpy(remcom_out_buffer, "OK");
914 }
915}
916
917/* Handle the 'm' memory read bytes */
918static void gdb_cmd_memread(struct kgdb_state *ks)
919{
920 char *ptr = &remcom_in_buffer[1];
921 unsigned long length;
922 unsigned long addr;
923 int err;
924
925 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
926 kgdb_hex2long(&ptr, &length) > 0) {
927 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
928 if (err)
929 error_packet(remcom_out_buffer, err);
930 } else {
931 error_packet(remcom_out_buffer, -EINVAL);
932 }
933}
934
935/* Handle the 'M' memory write bytes */
936static void gdb_cmd_memwrite(struct kgdb_state *ks)
937{
938 int err = write_mem_msg(0);
939
940 if (err)
941 error_packet(remcom_out_buffer, err);
942 else
943 strcpy(remcom_out_buffer, "OK");
944}
945
946/* Handle the 'X' memory binary write bytes */
947static void gdb_cmd_binwrite(struct kgdb_state *ks)
948{
949 int err = write_mem_msg(1);
950
951 if (err)
952 error_packet(remcom_out_buffer, err);
953 else
954 strcpy(remcom_out_buffer, "OK");
955}
956
957/* Handle the 'D' or 'k', detach or kill packets */
958static void gdb_cmd_detachkill(struct kgdb_state *ks)
959{
960 int error;
961
962 /* The detach case */
963 if (remcom_in_buffer[0] == 'D') {
964 error = remove_all_break();
965 if (error < 0) {
966 error_packet(remcom_out_buffer, error);
967 } else {
968 strcpy(remcom_out_buffer, "OK");
969 kgdb_connected = 0;
970 }
971 put_packet(remcom_out_buffer);
972 } else {
973 /*
974 * Assume the kill case, with no exit code checking,
975 * trying to force detach the debugger:
976 */
977 remove_all_break();
978 kgdb_connected = 0;
979 }
980}
981
982/* Handle the 'R' reboot packets */
983static int gdb_cmd_reboot(struct kgdb_state *ks)
984{
985 /* For now, only honor R0 */
986 if (strcmp(remcom_in_buffer, "R0") == 0) {
987 printk(KERN_CRIT "Executing emergency reboot\n");
988 strcpy(remcom_out_buffer, "OK");
989 put_packet(remcom_out_buffer);
990
991 /*
992 * Execution should not return from
993 * machine_emergency_restart()
994 */
995 machine_emergency_restart();
996 kgdb_connected = 0;
997
998 return 1;
999 }
1000 return 0;
1001}
1002
1003/* Handle the 'q' query packets */
1004static void gdb_cmd_query(struct kgdb_state *ks)
1005{
1006 struct task_struct *g;
1007 struct task_struct *p;
1008 unsigned char thref[8];
1009 char *ptr;
1010 int i;
1011 int cpu;
1012 int finished = 0;
1013
1014 switch (remcom_in_buffer[1]) {
1015 case 's':
1016 case 'f':
1017 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
1018 error_packet(remcom_out_buffer, -EINVAL);
1019 break;
1020 }
1021
1022 i = 0;
1023 remcom_out_buffer[0] = 'm';
1024 ptr = remcom_out_buffer + 1;
1025 if (remcom_in_buffer[1] == 'f') {
1026 /* Each cpu is a shadow thread */
1027 for_each_online_cpu(cpu) {
1028 ks->thr_query = 0;
1029 int_to_threadref(thref, -cpu - 2);
1030 pack_threadid(ptr, thref);
1031 ptr += BUF_THREAD_ID_SIZE;
1032 *(ptr++) = ',';
1033 i++;
1034 }
1035 }
1036
1037 do_each_thread(g, p) {
1038 if (i >= ks->thr_query && !finished) {
1039 int_to_threadref(thref, p->pid);
1040 pack_threadid(ptr, thref);
1041 ptr += BUF_THREAD_ID_SIZE;
1042 *(ptr++) = ',';
1043 ks->thr_query++;
1044 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1045 finished = 1;
1046 }
1047 i++;
1048 } while_each_thread(g, p);
1049
1050 *(--ptr) = '\0';
1051 break;
1052
1053 case 'C':
1054 /* Current thread id */
1055 strcpy(remcom_out_buffer, "QC");
1056 ks->threadid = shadow_pid(current->pid);
1057 int_to_threadref(thref, ks->threadid);
1058 pack_threadid(remcom_out_buffer + 2, thref);
1059 break;
1060 case 'T':
1061 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1062 error_packet(remcom_out_buffer, -EINVAL);
1063 break;
1064 }
1065 ks->threadid = 0;
1066 ptr = remcom_in_buffer + 17;
1067 kgdb_hex2long(&ptr, &ks->threadid);
1068 if (!getthread(ks->linux_regs, ks->threadid)) {
1069 error_packet(remcom_out_buffer, -EINVAL);
1070 break;
1071 }
1072 if ((int)ks->threadid > 0) {
1073 kgdb_mem2hex(getthread(ks->linux_regs,
1074 ks->threadid)->comm,
1075 remcom_out_buffer, 16);
1076 } else {
1077 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1078
1079 sprintf(tmpstr, "shadowCPU%d",
1080 (int)(-ks->threadid - 2));
1081 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1082 }
1083 break;
1084 }
1085}
1086
1087/* Handle the 'H' task query packets */
1088static void gdb_cmd_task(struct kgdb_state *ks)
1089{
1090 struct task_struct *thread;
1091 char *ptr;
1092
1093 switch (remcom_in_buffer[1]) {
1094 case 'g':
1095 ptr = &remcom_in_buffer[2];
1096 kgdb_hex2long(&ptr, &ks->threadid);
1097 thread = getthread(ks->linux_regs, ks->threadid);
1098 if (!thread && ks->threadid > 0) {
1099 error_packet(remcom_out_buffer, -EINVAL);
1100 break;
1101 }
1102 kgdb_usethread = thread;
1103 ks->kgdb_usethreadid = ks->threadid;
1104 strcpy(remcom_out_buffer, "OK");
1105 break;
1106 case 'c':
1107 ptr = &remcom_in_buffer[2];
1108 kgdb_hex2long(&ptr, &ks->threadid);
1109 if (!ks->threadid) {
1110 kgdb_contthread = NULL;
1111 } else {
1112 thread = getthread(ks->linux_regs, ks->threadid);
1113 if (!thread && ks->threadid > 0) {
1114 error_packet(remcom_out_buffer, -EINVAL);
1115 break;
1116 }
1117 kgdb_contthread = thread;
1118 }
1119 strcpy(remcom_out_buffer, "OK");
1120 break;
1121 }
1122}
1123
1124/* Handle the 'T' thread query packets */
1125static void gdb_cmd_thread(struct kgdb_state *ks)
1126{
1127 char *ptr = &remcom_in_buffer[1];
1128 struct task_struct *thread;
1129
1130 kgdb_hex2long(&ptr, &ks->threadid);
1131 thread = getthread(ks->linux_regs, ks->threadid);
1132 if (thread)
1133 strcpy(remcom_out_buffer, "OK");
1134 else
1135 error_packet(remcom_out_buffer, -EINVAL);
1136}
1137
1138/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1139static void gdb_cmd_break(struct kgdb_state *ks)
1140{
1141 /*
1142 * Since GDB-5.3, it's been drafted that '0' is a software
1143 * breakpoint, '1' is a hardware breakpoint, so let's do that.
1144 */
1145 char *bpt_type = &remcom_in_buffer[1];
1146 char *ptr = &remcom_in_buffer[2];
1147 unsigned long addr;
1148 unsigned long length;
1149 int error = 0;
1150
1151 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1152 /* Unsupported */
1153 if (*bpt_type > '4')
1154 return;
1155 } else {
1156 if (*bpt_type != '0' && *bpt_type != '1')
1157 /* Unsupported. */
1158 return;
1159 }
1160
1161 /*
1162 * Test if this is a hardware breakpoint, and
1163 * if we support it:
1164 */
1165 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1166 /* Unsupported. */
1167 return;
1168
1169 if (*(ptr++) != ',') {
1170 error_packet(remcom_out_buffer, -EINVAL);
1171 return;
1172 }
1173 if (!kgdb_hex2long(&ptr, &addr)) {
1174 error_packet(remcom_out_buffer, -EINVAL);
1175 return;
1176 }
1177 if (*(ptr++) != ',' ||
1178 !kgdb_hex2long(&ptr, &length)) {
1179 error_packet(remcom_out_buffer, -EINVAL);
1180 return;
1181 }
1182
1183 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1184 error = kgdb_set_sw_break(addr);
1185 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1186 error = kgdb_remove_sw_break(addr);
1187 else if (remcom_in_buffer[0] == 'Z')
1188 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1189 (int)length, *bpt_type - '0');
1190 else if (remcom_in_buffer[0] == 'z')
1191 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1192 (int) length, *bpt_type - '0');
1193
1194 if (error == 0)
1195 strcpy(remcom_out_buffer, "OK");
1196 else
1197 error_packet(remcom_out_buffer, error);
1198}
1199
1200/* Handle the 'C' signal / exception passing packets */
1201static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1202{
1203 /* C09 == pass exception
1204 * C15 == detach kgdb, pass exception
1205 */
1206 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1207
1208 ks->pass_exception = 1;
1209 remcom_in_buffer[0] = 'c';
1210
1211 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1212
1213 ks->pass_exception = 1;
1214 remcom_in_buffer[0] = 'D';
1215 remove_all_break();
1216 kgdb_connected = 0;
1217 return 1;
1218
1219 } else {
1220 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1221 " and 15 (pass and disconnect)\n"
1222 "Executing a continue without signal passing\n", 0);
1223 remcom_in_buffer[0] = 'c';
1224 }
1225
1226 /* Indicate fall through */
1227 return -1;
1228}
1229
1230/*
1231 * This function performs all gdbserial command processing
1232 */
1233static int gdb_serial_stub(struct kgdb_state *ks)
1234{
1235 int error = 0;
1236 int tmp;
1237
1238 /* Clear the out buffer. */
1239 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1240
1241 if (kgdb_connected) {
1242 unsigned char thref[8];
1243 char *ptr;
1244
1245 /* Reply to host that an exception has occurred */
1246 ptr = remcom_out_buffer;
1247 *ptr++ = 'T';
1248 ptr = pack_hex_byte(ptr, ks->signo);
1249 ptr += strlen(strcpy(ptr, "thread:"));
1250 int_to_threadref(thref, shadow_pid(current->pid));
1251 ptr = pack_threadid(ptr, thref);
1252 *ptr++ = ';';
1253 put_packet(remcom_out_buffer);
1254 }
1255
1256 kgdb_usethread = kgdb_info[ks->cpu].task;
1257 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1258 ks->pass_exception = 0;
1259
1260 while (1) {
1261 error = 0;
1262
1263 /* Clear the out buffer. */
1264 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1265
1266 get_packet(remcom_in_buffer);
1267
1268 switch (remcom_in_buffer[0]) {
1269 case '?': /* gdbserial status */
1270 gdb_cmd_status(ks);
1271 break;
1272 case 'g': /* return the value of the CPU registers */
1273 gdb_cmd_getregs(ks);
1274 break;
1275 case 'G': /* set the value of the CPU registers - return OK */
1276 gdb_cmd_setregs(ks);
1277 break;
1278 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1279 gdb_cmd_memread(ks);
1280 break;
1281 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1282 gdb_cmd_memwrite(ks);
1283 break;
1284 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1285 gdb_cmd_binwrite(ks);
1286 break;
1287 /* kill or detach. KGDB should treat this like a
1288 * continue.
1289 */
1290 case 'D': /* Debugger detach */
1291 case 'k': /* Debugger detach via kill */
1292 gdb_cmd_detachkill(ks);
1293 goto default_handle;
1294 case 'R': /* Reboot */
1295 if (gdb_cmd_reboot(ks))
1296 goto default_handle;
1297 break;
1298 case 'q': /* query command */
1299 gdb_cmd_query(ks);
1300 break;
1301 case 'H': /* task related */
1302 gdb_cmd_task(ks);
1303 break;
1304 case 'T': /* Query thread status */
1305 gdb_cmd_thread(ks);
1306 break;
1307 case 'z': /* Break point remove */
1308 case 'Z': /* Break point set */
1309 gdb_cmd_break(ks);
1310 break;
1311 case 'C': /* Exception passing */
1312 tmp = gdb_cmd_exception_pass(ks);
1313 if (tmp > 0)
1314 goto default_handle;
1315 if (tmp == 0)
1316 break;
1317 /* Fall through on tmp < 0 */
1318 case 'c': /* Continue packet */
1319 case 's': /* Single step packet */
1320 if (kgdb_contthread && kgdb_contthread != current) {
1321 /* Can't switch threads in kgdb */
1322 error_packet(remcom_out_buffer, -EINVAL);
1323 break;
1324 }
1325 kgdb_activate_sw_breakpoints();
1326 /* Fall through to default processing */
1327 default:
1328default_handle:
1329 error = kgdb_arch_handle_exception(ks->ex_vector,
1330 ks->signo,
1331 ks->err_code,
1332 remcom_in_buffer,
1333 remcom_out_buffer,
1334 ks->linux_regs);
1335 /*
1336 * Leave cmd processing on error, detach,
1337 * kill, continue, or single step.
1338 */
1339 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1340 remcom_in_buffer[0] == 'k') {
1341 error = 0;
1342 goto kgdb_exit;
1343 }
1344
1345 }
1346
1347 /* reply to the request */
1348 put_packet(remcom_out_buffer);
1349 }
1350
1351kgdb_exit:
1352 if (ks->pass_exception)
1353 error = 1;
1354 return error;
1355}
1356
1357static int kgdb_reenter_check(struct kgdb_state *ks)
1358{
1359 unsigned long addr;
1360
1361 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1362 return 0;
1363
1364 /* Panic on recursive debugger calls: */
1365 exception_level++;
1366 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1367 kgdb_deactivate_sw_breakpoints();
1368
1369 /*
1370 * If the breakpoint was removed OK at the place the exception
1371 * occurred, try to recover and print a warning to the end
1372 * user because the user planted a breakpoint in a place that
1373 * KGDB needs in order to function.
1374 */
1375 if (kgdb_remove_sw_break(addr) == 0) {
1376 exception_level = 0;
1377 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1378 kgdb_activate_sw_breakpoints();
1379 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1380 addr);
1381 WARN_ON_ONCE(1);
1382
1383 return 1;
1384 }
1385 remove_all_break();
1386 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1387
1388 if (exception_level > 1) {
1389 dump_stack();
1390 panic("Recursive entry to debugger");
1391 }
1392
1393 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1394 dump_stack();
1395 panic("Recursive entry to debugger");
1396
1397 return 1;
1398}
1399
1400/*
1401 * kgdb_handle_exception() - main entry point from a kernel exception
1402 *
1403 * Locking hierarchy:
1404 * interface locks, if any (begin_session)
1405 * kgdb lock (kgdb_active)
1406 */
1407int
1408kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1409{
1410 struct kgdb_state kgdb_var;
1411 struct kgdb_state *ks = &kgdb_var;
1412 unsigned long flags;
1413 int sstep_tries = 100;
1414 int error = 0;
1415 int i, cpu;
1416
1417 ks->cpu = raw_smp_processor_id();
1418 ks->ex_vector = evector;
1419 ks->signo = signo;
1420 ks->ex_vector = evector;
1421 ks->err_code = ecode;
1422 ks->kgdb_usethreadid = 0;
1423 ks->linux_regs = regs;
1424
1425 if (kgdb_reenter_check(ks))
1426 return 0; /* Ouch, double exception ! */
1427
1428acquirelock:
1429 /*
1430 * Interrupts will be restored by the 'trap return' code, except when
1431 * single stepping.
1432 */
1433 local_irq_save(flags);
1434
1435 cpu = raw_smp_processor_id();
1436
1437 /*
1438 * Acquire the kgdb_active lock:
1439 */
1440 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
1441 cpu_relax();
1442
1443 /*
1444 * For single stepping, try to only enter on the processor
1445 * that was single stepping. To guard against a deadlock, the
1446 * kernel will only try for the value of sstep_tries before
1447 * giving up and continuing on.
1448 */
1449 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1450 (kgdb_info[cpu].task &&
1451 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1452 atomic_set(&kgdb_active, -1);
1453 touch_softlockup_watchdog();
1454 clocksource_touch_watchdog();
1455 local_irq_restore(flags);
1456
1457 goto acquirelock;
1458 }
1459
1460 if (!kgdb_io_ready(1)) {
1461 error = 1;
1462 goto kgdb_restore; /* No I/O connection, so resume the system */
1463 }
1464
1465 /*
1466 * Don't enter if we have hit a removed breakpoint.
1467 */
1468 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1469 goto kgdb_restore;
1470
1471 /* Call the I/O driver's pre_exception routine */
1472 if (kgdb_io_ops->pre_exception)
1473 kgdb_io_ops->pre_exception();
1474
1475 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1476 kgdb_info[ks->cpu].task = current;
1477
1478 kgdb_disable_hw_debug(ks->linux_regs);
1479
1480 /*
1481 * Get the passive CPU lock which will hold all the non-primary
1482 * CPU in a spin state while the debugger is active
1483 */
1484 if (!kgdb_single_step) {
1485 for (i = 0; i < NR_CPUS; i++)
1486 atomic_set(&passive_cpu_wait[i], 1);
1487 }
1488
1489 /*
1490 * spin_lock code is good enough as a barrier so we don't
1491 * need one here:
1492 */
1493 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1494
1495#ifdef CONFIG_SMP
1496 /* Signal the other CPUs to enter kgdb_wait() */
1497 if ((!kgdb_single_step) && kgdb_do_roundup)
1498 kgdb_roundup_cpus(flags);
1499#endif
1500
1501 /*
1502 * Wait for the other CPUs to be notified and be waiting for us:
1503 */
1504 for_each_online_cpu(i) {
1505 while (!atomic_read(&cpu_in_kgdb[i]))
1506 cpu_relax();
1507 }
1508
1509 /*
1510 * At this point the primary processor is completely
1511 * in the debugger and all secondary CPUs are quiescent
1512 */
1513 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1514 kgdb_deactivate_sw_breakpoints();
1515 kgdb_single_step = 0;
1516 kgdb_contthread = current;
1517 exception_level = 0;
1518
1519 /* Talk to debugger with gdbserial protocol */
1520 error = gdb_serial_stub(ks);
1521
1522 /* Call the I/O driver's post_exception routine */
1523 if (kgdb_io_ops->post_exception)
1524 kgdb_io_ops->post_exception();
1525
1526 kgdb_info[ks->cpu].debuggerinfo = NULL;
1527 kgdb_info[ks->cpu].task = NULL;
1528 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1529
1530 if (!kgdb_single_step) {
1531 for (i = NR_CPUS-1; i >= 0; i--)
1532 atomic_set(&passive_cpu_wait[i], 0);
1533 /*
1534 * Wait till all the CPUs have quit
1535 * from the debugger.
1536 */
1537 for_each_online_cpu(i) {
1538 while (atomic_read(&cpu_in_kgdb[i]))
1539 cpu_relax();
1540 }
1541 }
1542
1543kgdb_restore:
1544 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1545 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1546 if (kgdb_info[sstep_cpu].task)
1547 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1548 else
1549 kgdb_sstep_pid = 0;
1550 }
1551 /* Free kgdb_active */
1552 atomic_set(&kgdb_active, -1);
1553 touch_softlockup_watchdog();
1554 clocksource_touch_watchdog();
1555 local_irq_restore(flags);
1556
1557 return error;
1558}
1559
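
kgdb_handle_exception() is the round-up choreography: the first CPU to take the exception wins the kgdb_active cmpxchg, sets passive_cpu_wait[] so the other CPUs park themselves in kgdb_wait(), asks the architecture to round them up, and then waits for every online CPU to check in through cpu_in_kgdb[] before the gdb session starts; releasing the CPUs reverses the handshake. A user-space analogue of that two-flag handshake, with threads standing in for CPUs (a sketch under that analogy, not kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int passive_wait[NCPUS];	/* primary tells the others to park */
static atomic_int in_debugger[NCPUS];	/* each "CPU" reports it has parked */

static void *passive_cpu(void *arg)
{
	int cpu = (int)(long)arg;

	/* kgdb_wait(): announce ourselves, then spin until released. */
	atomic_store(&in_debugger[cpu], 1);
	while (atomic_load(&passive_wait[cpu]))
		;	/* cpu_relax() */
	atomic_store(&in_debugger[cpu], 0);
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	int i;

	/* Primary "CPU" 0: park the others before "debugging". */
	for (i = 1; i < NCPUS; i++)
		atomic_store(&passive_wait[i], 1);
	for (i = 1; i < NCPUS; i++)
		pthread_create(&tid[i], NULL, passive_cpu, (void *)(long)i);

	/* Wait until every passive CPU has checked in. */
	for (i = 1; i < NCPUS; i++)
		while (!atomic_load(&in_debugger[i]))
			;
	printf("all passive CPUs parked; the gdb session would run here\n");

	/* Release them and wait until they have all left. */
	for (i = 1; i < NCPUS; i++)
		atomic_store(&passive_wait[i], 0);
	for (i = 1; i < NCPUS; i++)
		while (atomic_load(&in_debugger[i]))
			;
	for (i = 1; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	printf("all passive CPUs released\n");
	return 0;
}
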
1560int kgdb_nmicallback(int cpu, void *regs)
1561{
1562#ifdef CONFIG_SMP
1563 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1564 atomic_read(&kgdb_active) != cpu &&
1565 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
1566 kgdb_wait((struct pt_regs *)regs);
1567 return 0;
1568 }
1569#endif
1570 return 1;
1571}
1572
1573static void kgdb_console_write(struct console *co, const char *s,
1574 unsigned count)
1575{
1576 unsigned long flags;
1577
1578 /* If we're debugging, or KGDB has not connected, don't try
1579 * and print. */
1580 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1581 return;
1582
1583 local_irq_save(flags);
1584 kgdb_msg_write(s, count);
1585 local_irq_restore(flags);
1586}
1587
1588static struct console kgdbcons = {
1589 .name = "kgdb",
1590 .write = kgdb_console_write,
1591 .flags = CON_PRINTBUFFER | CON_ENABLED,
1592 .index = -1,
1593};
1594
1595#ifdef CONFIG_MAGIC_SYSRQ
1596static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1597{
1598 if (!kgdb_io_ops) {
1599 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1600 return;
1601 }
1602 if (!kgdb_connected)
1603 printk(KERN_CRIT "Entering KGDB\n");
1604
1605 kgdb_breakpoint();
1606}
1607
1608static struct sysrq_key_op sysrq_gdb_op = {
1609 .handler = sysrq_handle_gdb,
1610 .help_msg = "debug(G)",
1611 .action_msg = "DEBUG",
1612};
1613#endif
1614
1615static void kgdb_register_callbacks(void)
1616{
1617 if (!kgdb_io_module_registered) {
1618 kgdb_io_module_registered = 1;
1619 kgdb_arch_init();
1620#ifdef CONFIG_MAGIC_SYSRQ
1621 register_sysrq_key('g', &sysrq_gdb_op);
1622#endif
1623 if (kgdb_use_con && !kgdb_con_registered) {
1624 register_console(&kgdbcons);
1625 kgdb_con_registered = 1;
1626 }
1627 }
1628}
1629
1630static void kgdb_unregister_callbacks(void)
1631{
1632 /*
1633 * When this routine is called KGDB should unregister from the
1634 * panic handler and clean up, making sure it is not handling any
1635 * break exceptions at the time.
1636 */
1637 if (kgdb_io_module_registered) {
1638 kgdb_io_module_registered = 0;
1639 kgdb_arch_exit();
1640#ifdef CONFIG_MAGIC_SYSRQ
1641 unregister_sysrq_key('g', &sysrq_gdb_op);
1642#endif
1643 if (kgdb_con_registered) {
1644 unregister_console(&kgdbcons);
1645 kgdb_con_registered = 0;
1646 }
1647 }
1648}
1649
1650static void kgdb_initial_breakpoint(void)
1651{
1652 kgdb_break_asap = 0;
1653
1654 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1655 kgdb_breakpoint();
1656}
1657
1658/**
1659 * kgdb_register_io_module - register KGDB IO module
1660 * @new_kgdb_io_ops: the io ops vector
1661 *
1662 * Register it with the KGDB core.
1663 */
1664int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1665{
1666 int err;
1667
1668 spin_lock(&kgdb_registration_lock);
1669
1670 if (kgdb_io_ops) {
1671 spin_unlock(&kgdb_registration_lock);
1672
1673 printk(KERN_ERR "kgdb: Another I/O driver is already "
1674 "registered with KGDB.\n");
1675 return -EBUSY;
1676 }
1677
1678 if (new_kgdb_io_ops->init) {
1679 err = new_kgdb_io_ops->init();
1680 if (err) {
1681 spin_unlock(&kgdb_registration_lock);
1682 return err;
1683 }
1684 }
1685
1686 kgdb_io_ops = new_kgdb_io_ops;
1687
1688 spin_unlock(&kgdb_registration_lock);
1689
1690 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1691 new_kgdb_io_ops->name);
1692
1693 /* Arm KGDB now. */
1694 kgdb_register_callbacks();
1695
1696 if (kgdb_break_asap)
1697 kgdb_initial_breakpoint();
1698
1699 return 0;
1700}
1701EXPORT_SYMBOL_GPL(kgdb_register_io_module);
1702
1703/**
1704 * kgdb_unregister_io_module - unregister KGDB IO module
1705 * @old_kgdb_io_ops: the io ops vector
1706 *
1707 * Unregister it with the KGDB core.
1708 */
1709void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1710{
1711 BUG_ON(kgdb_connected);
1712
1713 /*
1714 * KGDB is no longer able to communicate out, so
1715 * unregister our callbacks and reset state.
1716 */
1717 kgdb_unregister_callbacks();
1718
1719 spin_lock(&kgdb_registration_lock);
1720
1721 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1722 kgdb_io_ops = NULL;
1723
1724 spin_unlock(&kgdb_registration_lock);
1725
1726 printk(KERN_INFO
1727 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1728 old_kgdb_io_ops->name);
1729}
1730EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1731
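
kgdb_register_io_module()/kgdb_unregister_io_module() are the only entry points an I/O driver (a polled serial port, for example) needs; the core then drives the connection through the read_char/write_char/flush callbacks used throughout this file. The sketch below shows the general shape of such a driver; the uart_poll_getc()/uart_poll_putc() helpers are hypothetical, and the struct kgdb_io field types are assumed from how this file calls them rather than quoted from the header:

#include <linux/kgdb.h>
#include <linux/module.h>

/* Hypothetical polled-UART primitives provided elsewhere by the driver. */
extern int uart_poll_getc(void);	/* blocks, returns one character */
extern void uart_poll_putc(u8 c);

static int my_kgdb_read_char(void)
{
	return uart_poll_getc();
}

static void my_kgdb_write_char(u8 c)
{
	uart_poll_putc(c);
}

static struct kgdb_io my_kgdb_io_ops = {
	.name		= "my_uart_kgdb",
	.read_char	= my_kgdb_read_char,
	.write_char	= my_kgdb_write_char,
};

static int __init my_kgdb_init(void)
{
	/* Arms KGDB: from here on, kgdbwait or sysrq-g can stop the kernel. */
	return kgdb_register_io_module(&my_kgdb_io_ops);
}

static void __exit my_kgdb_exit(void)
{
	kgdb_unregister_io_module(&my_kgdb_io_ops);
}

module_init(my_kgdb_init);
module_exit(my_kgdb_exit);
MODULE_LICENSE("GPL");

With "kgdbwait" on the command line, opt_kgdb_wait() below marks kgdb_break_asap, so kgdb_initial_breakpoint() fires as soon as a driver like this registers.
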
1732/**
1733 * kgdb_breakpoint - generate breakpoint exception
1734 *
1735 * This function will generate a breakpoint exception. It is used at the
1736 * beginning of a program to sync up with a debugger and can be used
1737 * otherwise as a quick means to stop program execution and "break" into
1738 * the debugger.
1739 */
1740void kgdb_breakpoint(void)
1741{
1742 atomic_set(&kgdb_setting_breakpoint, 1);
1743 wmb(); /* Sync point before breakpoint */
1744 arch_kgdb_breakpoint();
1745 wmb(); /* Sync point after breakpoint */
1746 atomic_set(&kgdb_setting_breakpoint, 0);
1747}
1748EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1749
1750static int __init opt_kgdb_wait(char *str)
1751{
1752 kgdb_break_asap = 1;
1753
1754 if (kgdb_io_module_registered)
1755 kgdb_initial_breakpoint();
1756
1757 return 0;
1758}
1759
1760early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
116 116
117 trace_module_request(module_name, wait, _RET_IP_); 117 trace_module_request(module_name, wait, _RET_IP_);
118 118
119 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
121 NULL, NULL, NULL);
122
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return ret; 124 return ret;
123} 125}
124EXPORT_SYMBOL(__request_module); 126EXPORT_SYMBOL(__request_module);
125#endif /* CONFIG_MODULES */ 127#endif /* CONFIG_MODULES */
126 128
127struct subprocess_info {
128 struct work_struct work;
129 struct completion *complete;
130 struct cred *cred;
131 char *path;
132 char **argv;
133 char **envp;
134 enum umh_wait wait;
135 int retval;
136 struct file *stdin;
137 void (*cleanup)(char **argv, char **envp);
138};
139
140/* 129/*
141 * This is the task which runs the usermode application 130 * This is the task which runs the usermode application
142 */ 131 */
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
145 struct subprocess_info *sub_info = data; 134 struct subprocess_info *sub_info = data;
146 int retval; 135 int retval;
147 136
148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
149
150 /* Unblock all signals */
151 spin_lock_irq(&current->sighand->siglock); 137 spin_lock_irq(&current->sighand->siglock);
152 flush_signal_handlers(current, 1); 138 flush_signal_handlers(current, 1);
153 sigemptyset(&current->blocked);
154 recalc_sigpending();
155 spin_unlock_irq(&current->sighand->siglock); 139 spin_unlock_irq(&current->sighand->siglock);
156 140
157 /* Install the credentials */
158 commit_creds(sub_info->cred);
159 sub_info->cred = NULL;
160
161 /* Install input pipe when needed */
162 if (sub_info->stdin) {
163 struct files_struct *f = current->files;
164 struct fdtable *fdt;
165 /* no races because files should be private here */
166 sys_close(0);
167 fd_install(0, sub_info->stdin);
168 spin_lock(&f->file_lock);
169 fdt = files_fdtable(f);
170 FD_SET(0, fdt->open_fds);
171 FD_CLR(0, fdt->close_on_exec);
172 spin_unlock(&f->file_lock);
173
174 /* and disallow core files too */
175 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
176 }
177
178 /* We can run anywhere, unlike our parent keventd(). */ 141 /* We can run anywhere, unlike our parent keventd(). */
179 set_cpus_allowed_ptr(current, cpu_all_mask); 142 set_cpus_allowed_ptr(current, cpu_all_mask);
180 143
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
184 */ 147 */
185 set_user_nice(current, 0); 148 set_user_nice(current, 0);
186 149
150 if (sub_info->init) {
151 retval = sub_info->init(sub_info);
152 if (retval)
153 goto fail;
154 }
155
187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
188 157
189 /* Exec failed? */ 158 /* Exec failed? */
159fail:
190 sub_info->retval = retval; 160 sub_info->retval = retval;
191 do_exit(0); 161 do_exit(0);
192} 162}
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 164void call_usermodehelper_freeinfo(struct subprocess_info *info)
195{ 165{
196 if (info->cleanup) 166 if (info->cleanup)
197 (*info->cleanup)(info->argv, info->envp); 167 (*info->cleanup)(info);
198 if (info->cred)
199 put_cred(info->cred);
200 kfree(info); 168 kfree(info);
201} 169}
202EXPORT_SYMBOL(call_usermodehelper_freeinfo); 170EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
207 struct subprocess_info *sub_info = data; 175 struct subprocess_info *sub_info = data;
208 pid_t pid; 176 pid_t pid;
209 177
210 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 178 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
211 * populate the status, but will return -ECHILD. */ 179 spin_lock_irq(&current->sighand->siglock);
212 allow_signal(SIGCHLD); 180 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
181 spin_unlock_irq(&current->sighand->siglock);
213 182
214 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 183 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
215 if (pid < 0) { 184 if (pid < 0) {
216 sub_info->retval = pid; 185 sub_info->retval = pid;
217 } else { 186 } else {
218 int ret; 187 int ret = -ECHILD;
219
220 /* 188 /*
221 * Normally it is bogus to call wait4() from in-kernel because 189 * Normally it is bogus to call wait4() from in-kernel because
222 * wait4() wants to write the exit code to a userspace address. 190 * wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
237 sub_info->retval = ret; 205 sub_info->retval = ret;
238 } 206 }
239 207
240 if (sub_info->wait == UMH_NO_WAIT) 208 complete(sub_info->complete);
241 call_usermodehelper_freeinfo(sub_info);
242 else
243 complete(sub_info->complete);
244 return 0; 209 return 0;
245} 210}
246 211
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
249{ 214{
250 struct subprocess_info *sub_info = 215 struct subprocess_info *sub_info =
251 container_of(work, struct subprocess_info, work); 216 container_of(work, struct subprocess_info, work);
252 pid_t pid;
253 enum umh_wait wait = sub_info->wait; 217 enum umh_wait wait = sub_info->wait;
254 218 pid_t pid;
255 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
256 219
257 /* CLONE_VFORK: wait until the usermode helper has execve'd 220 /* CLONE_VFORK: wait until the usermode helper has execve'd
258 * successfully We need the data structures to stay around 221 * successfully We need the data structures to stay around
259 * until that is done. */ 222 * until that is done. */
260 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) 223 if (wait == UMH_WAIT_PROC)
261 pid = kernel_thread(wait_for_helper, sub_info, 224 pid = kernel_thread(wait_for_helper, sub_info,
262 CLONE_FS | CLONE_FILES | SIGCHLD); 225 CLONE_FS | CLONE_FILES | SIGCHLD);
263 else 226 else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
266 229
267 switch (wait) { 230 switch (wait) {
268 case UMH_NO_WAIT: 231 case UMH_NO_WAIT:
232 call_usermodehelper_freeinfo(sub_info);
269 break; 233 break;
270 234
271 case UMH_WAIT_PROC: 235 case UMH_WAIT_PROC:
272 if (pid > 0) 236 if (pid > 0)
273 break; 237 break;
274 sub_info->retval = pid;
275 /* FALLTHROUGH */ 238 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC: 239 case UMH_WAIT_EXEC:
240 if (pid < 0)
241 sub_info->retval = pid;
278 complete(sub_info->complete); 242 complete(sub_info->complete);
279 } 243 }
280} 244}
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
376 sub_info->path = path; 340 sub_info->path = path;
377 sub_info->argv = argv; 341 sub_info->argv = argv;
378 sub_info->envp = envp; 342 sub_info->envp = envp;
379 sub_info->cred = prepare_usermodehelper_creds();
380 if (!sub_info->cred) {
381 kfree(sub_info);
382 return NULL;
383 }
384
385 out: 343 out:
386 return sub_info; 344 return sub_info;
387} 345}
388EXPORT_SYMBOL(call_usermodehelper_setup); 346EXPORT_SYMBOL(call_usermodehelper_setup);
389 347
390/** 348/**
391 * call_usermodehelper_setkeys - set the session keys for usermode helper 349 * call_usermodehelper_setfns - set a cleanup/init function
392 * @info: a subprocess_info returned by call_usermodehelper_setup
393 * @session_keyring: the session keyring for the process
394 */
395void call_usermodehelper_setkeys(struct subprocess_info *info,
396 struct key *session_keyring)
397{
398#ifdef CONFIG_KEYS
399 struct thread_group_cred *tgcred = info->cred->tgcred;
400 key_put(tgcred->session_keyring);
401 tgcred->session_keyring = key_get(session_keyring);
402#else
403 BUG();
404#endif
405}
406EXPORT_SYMBOL(call_usermodehelper_setkeys);
407
408/**
409 * call_usermodehelper_setcleanup - set a cleanup function
410 * @info: a subprocess_info returned by call_usermodehelper_setup 350 * @info: a subprocess_info returned by call_usermodehelper_setup
411 * @cleanup: a cleanup function 351 * @cleanup: a cleanup function
352 * @init: an init function
353 * @data: arbitrary context sensitive data
412 * 354 *
413 * The cleanup function is just befor ethe subprocess_info is about to 355 * The init function is used to customize the helper process prior to
356 * exec. A non-zero return code causes the process to error out, exit,
357 * and return the failure to the calling process
358 *
359 * The cleanup function is just before the subprocess_info is about to
414 * be freed. This can be used for freeing the argv and envp. The 360 * be freed. This can be used for freeing the argv and envp. The
415 * Function must be runnable in either a process context or the 361 * Function must be runnable in either a process context or the
416 * context in which call_usermodehelper_exec is called. 362 * context in which call_usermodehelper_exec is called.
417 */ 363 */
418void call_usermodehelper_setcleanup(struct subprocess_info *info, 364void call_usermodehelper_setfns(struct subprocess_info *info,
419 void (*cleanup)(char **argv, char **envp)) 365 int (*init)(struct subprocess_info *info),
366 void (*cleanup)(struct subprocess_info *info),
367 void *data)
420{ 368{
421 info->cleanup = cleanup; 369 info->cleanup = cleanup;
370 info->init = init;
371 info->data = data;
422} 372}
423EXPORT_SYMBOL(call_usermodehelper_setcleanup); 373EXPORT_SYMBOL(call_usermodehelper_setfns);
424
425/**
426 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
427 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
428 * @filp: set to the write-end of a pipe
429 *
430 * This constructs a pipe, and sets the read end to be the stdin of the
431 * subprocess, and returns the write-end in *@filp.
432 */
433int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
434 struct file **filp)
435{
436 struct file *f;
437
438 f = create_write_pipe(0);
439 if (IS_ERR(f))
440 return PTR_ERR(f);
441 *filp = f;
442
443 f = create_read_pipe(f, 0);
444 if (IS_ERR(f)) {
445 free_write_pipe(*filp);
446 return PTR_ERR(f);
447 }
448 sub_info->stdin = f;
449
450 return 0;
451}
452EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
453 374
454/** 375/**
455 * call_usermodehelper_exec - start a usermode application 376 * call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
469 DECLARE_COMPLETION_ONSTACK(done); 390 DECLARE_COMPLETION_ONSTACK(done);
470 int retval = 0; 391 int retval = 0;
471 392
472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
474
475 helper_lock(); 393 helper_lock();
476 if (sub_info->path[0] == '\0') 394 if (sub_info->path[0] == '\0')
477 goto out; 395 goto out;
@@ -498,41 +416,6 @@ unlock:
498} 416}
499EXPORT_SYMBOL(call_usermodehelper_exec); 417EXPORT_SYMBOL(call_usermodehelper_exec);
500 418
501/**
502 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
503 * @path: path to usermode executable
504 * @argv: arg vector for process
505 * @envp: environment for process
506 * @filp: set to the write-end of a pipe
507 *
508 * This is a simple wrapper which executes a usermode-helper function
509 * with a pipe as stdin. It is implemented entirely in terms of
510 * lower-level call_usermodehelper_* functions.
511 */
512int call_usermodehelper_pipe(char *path, char **argv, char **envp,
513 struct file **filp)
514{
515 struct subprocess_info *sub_info;
516 int ret;
517
518 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
519 if (sub_info == NULL)
520 return -ENOMEM;
521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) {
524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
527
528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
531
532 return ret;
533}
534EXPORT_SYMBOL(call_usermodehelper_pipe);
535
536void __init usermodehelper_init(void) 419void __init usermodehelper_init(void)
537{ 420{
538 khelper_wq = create_singlethread_workqueue("khelper"); 421 khelper_wq = create_singlethread_workqueue("khelper");
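
The net effect of this kmod.c rework: per-call customisation (credentials, stdin pipes) no longer has dedicated setup helpers; instead a single init() callback runs in the helper task just before kernel_execve(), and cleanup()/data travel inside the subprocess_info. A hedged sketch of a caller of the new API, using only the signatures visible in this diff (call_usermodehelper_setup/_setfns/_exec and UMH_WAIT_EXEC); the helper path, arguments and callback bodies are made up for illustration:

#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>

static int my_helper_init(struct subprocess_info *info)
{
	/* Runs in the helper task right before exec; a non-zero return
	 * aborts the exec and is reported back through retval. */
	pr_info("about to exec %s\n", info->path);
	return 0;
}

static void my_helper_cleanup(struct subprocess_info *info)
{
	/* Called just before the subprocess_info is freed. */
	kfree(info->data);
}

static int run_my_helper(void)
{
	char *argv[] = { "/sbin/my-helper", "--oneshot", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
	struct subprocess_info *info;
	void *ctx;

	ctx = kzalloc(16, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info) {
		kfree(ctx);
		return -ENOMEM;
	}

	call_usermodehelper_setfns(info, my_helper_init, my_helper_cleanup, ctx);

	/* UMH_WAIT_PROC would instead wait for the helper to exit. */
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}

__request_module() in the first hunk of this file shows the shorthand for the same three steps: the new call_usermodehelper_fns() wrapper takes the init/cleanup/data triple directly.
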
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -42,8 +42,11 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/sysctl.h>
45#include <linux/kdebug.h> 46#include <linux/kdebug.h>
46#include <linux/memory.h> 47#include <linux/memory.h>
48#include <linux/ftrace.h>
49#include <linux/cpu.h>
47 50
48#include <asm-generic/sections.h> 51#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 52#include <asm/cacheflush.h>
@@ -93,6 +96,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 96 {"native_get_debugreg",},
94 {"irq_entries_start",}, 97 {"irq_entries_start",},
95 {"common_interrupt",}, 98 {"common_interrupt",},
99 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 100 {NULL} /* Terminator */
97}; 101};
98 102
@@ -103,81 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103 * stepping on the instruction on a vmalloced/kmalloced/data page 107 * stepping on the instruction on a vmalloced/kmalloced/data page
104 * is a recipe for disaster 108 * is a recipe for disaster
105 */ 109 */
106#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
107
108struct kprobe_insn_page { 110struct kprobe_insn_page {
109 struct list_head list; 111 struct list_head list;
110 kprobe_opcode_t *insns; /* Page of instruction slots */ 112 kprobe_opcode_t *insns; /* Page of instruction slots */
111 char slot_used[INSNS_PER_PAGE];
112 int nused; 113 int nused;
113 int ngarbage; 114 int ngarbage;
115 char slot_used[];
116};
117
118#define KPROBE_INSN_PAGE_SIZE(slots) \
119 (offsetof(struct kprobe_insn_page, slot_used) + \
120 (sizeof(char) * (slots)))
121
122struct kprobe_insn_cache {
123 struct list_head pages; /* list of kprobe_insn_page */
124 size_t insn_size; /* size of instruction slot */
125 int nr_garbage;
114}; 126};
115 127
128static int slots_per_page(struct kprobe_insn_cache *c)
129{
130 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
131}
132
116enum kprobe_slot_state { 133enum kprobe_slot_state {
117 SLOT_CLEAN = 0, 134 SLOT_CLEAN = 0,
118 SLOT_DIRTY = 1, 135 SLOT_DIRTY = 1,
119 SLOT_USED = 2, 136 SLOT_USED = 2,
120}; 137};
121 138
122static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 139static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */
123static LIST_HEAD(kprobe_insn_pages); 140static struct kprobe_insn_cache kprobe_insn_slots = {
124static int kprobe_garbage_slots; 141 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
125static int collect_garbage_slots(void); 142 .insn_size = MAX_INSN_SIZE,
126 143 .nr_garbage = 0,
127static int __kprobes check_safety(void) 144};
128{ 145static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150 146
151/** 147/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
154 */ 150 */
155static kprobe_opcode_t __kprobes *__get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
156{ 152{
157 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
158 154
159 retry: 155 retry:
160 list_for_each_entry(kip, &kprobe_insn_pages, list) { 156 list_for_each_entry(kip, &c->pages, list) {
161 if (kip->nused < INSNS_PER_PAGE) { 157 if (kip->nused < slots_per_page(c)) {
162 int i; 158 int i;
163 for (i = 0; i < INSNS_PER_PAGE; i++) { 159 for (i = 0; i < slots_per_page(c); i++) {
164 if (kip->slot_used[i] == SLOT_CLEAN) { 160 if (kip->slot_used[i] == SLOT_CLEAN) {
165 kip->slot_used[i] = SLOT_USED; 161 kip->slot_used[i] = SLOT_USED;
166 kip->nused++; 162 kip->nused++;
167 return kip->insns + (i * MAX_INSN_SIZE); 163 return kip->insns + (i * c->insn_size);
168 } 164 }
169 } 165 }
170 /* Surprise! No unused slots. Fix kip->nused. */ 166 /* kip->nused is broken. Fix it. */
171 kip->nused = INSNS_PER_PAGE; 167 kip->nused = slots_per_page(c);
168 WARN_ON(1);
172 } 169 }
173 } 170 }
174 171
175 /* If there are any garbage slots, collect it and try again. */ 172 /* If there are any garbage slots, collect it and try again. */
176 if (kprobe_garbage_slots && collect_garbage_slots() == 0) { 173 if (c->nr_garbage && collect_garbage_slots(c) == 0)
177 goto retry; 174 goto retry;
178 } 175
179 /* All out of space. Need to allocate a new page. Use slot 0. */ 176 /* All out of space. Need to allocate a new page. */
180 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 177 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
181 if (!kip) 178 if (!kip)
182 return NULL; 179 return NULL;
183 180
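
The page header now ends in a flexible array member sized per cache, so a single allocation covers the header plus one state byte per slot. A rough mirror of that sizing pattern (the struct and macro names here are illustrative, not the kernel's):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct example_insn_page {
	struct list_head list;
	void *insns;		/* backing page of instruction slots */
	int nused;
	int ngarbage;
	char slot_used[];	/* one state byte per slot, sized at alloc time */
};

#define EXAMPLE_PAGE_SIZE(slots) \
	(offsetof(struct example_insn_page, slot_used) + sizeof(char) * (slots))

static struct example_insn_page *alloc_example_page(int slots)
{
	/* header and slot-state array come from one allocation */
	return kmalloc(EXAMPLE_PAGE_SIZE(slots), GFP_KERNEL);
}
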
@@ -192,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
192 return NULL; 189 return NULL;
193 } 190 }
194 INIT_LIST_HEAD(&kip->list); 191 INIT_LIST_HEAD(&kip->list);
195 list_add(&kip->list, &kprobe_insn_pages); 192 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
196 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
197 kip->slot_used[0] = SLOT_USED; 193 kip->slot_used[0] = SLOT_USED;
198 kip->nused = 1; 194 kip->nused = 1;
199 kip->ngarbage = 0; 195 kip->ngarbage = 0;
196 list_add(&kip->list, &c->pages);
200 return kip->insns; 197 return kip->insns;
201} 198}
202 199
200
203kprobe_opcode_t __kprobes *get_insn_slot(void) 201kprobe_opcode_t __kprobes *get_insn_slot(void)
204{ 202{
205 kprobe_opcode_t *ret; 203 kprobe_opcode_t *ret = NULL;
204
206 mutex_lock(&kprobe_insn_mutex); 205 mutex_lock(&kprobe_insn_mutex);
207 ret = __get_insn_slot(); 206 ret = __get_insn_slot(&kprobe_insn_slots);
208 mutex_unlock(&kprobe_insn_mutex); 207 mutex_unlock(&kprobe_insn_mutex);
208
209 return ret; 209 return ret;
210} 210}
211 211
@@ -221,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
221 * so as not to have to set it up again the 221 * so as not to have to set it up again the
222 * next time somebody inserts a probe. 222 * next time somebody inserts a probe.
223 */ 223 */
224 if (!list_is_singular(&kprobe_insn_pages)) { 224 if (!list_is_singular(&kip->list)) {
225 list_del(&kip->list); 225 list_del(&kip->list);
226 module_free(NULL, kip->insns); 226 module_free(NULL, kip->insns);
227 kfree(kip); 227 kfree(kip);
@@ -231,52 +231,85 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
231 return 0; 231 return 0;
232} 232}
233 233
234static int __kprobes collect_garbage_slots(void) 234static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
235{ 235{
236 struct kprobe_insn_page *kip, *next; 236 struct kprobe_insn_page *kip, *next;
237 237
238 /* Ensure no-one is preepmted on the garbages */ 238 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 239 synchronize_sched();
240 return -EAGAIN;
241 240
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 241 list_for_each_entry_safe(kip, next, &c->pages, list) {
243 int i; 242 int i;
244 if (kip->ngarbage == 0) 243 if (kip->ngarbage == 0)
245 continue; 244 continue;
246 kip->ngarbage = 0; /* we will collect all garbages */ 245 kip->ngarbage = 0; /* we will collect all garbages */
247 for (i = 0; i < INSNS_PER_PAGE; i++) { 246 for (i = 0; i < slots_per_page(c); i++) {
248 if (kip->slot_used[i] == SLOT_DIRTY && 247 if (kip->slot_used[i] == SLOT_DIRTY &&
249 collect_one_slot(kip, i)) 248 collect_one_slot(kip, i))
250 break; 249 break;
251 } 250 }
252 } 251 }
253 kprobe_garbage_slots = 0; 252 c->nr_garbage = 0;
254 return 0; 253 return 0;
255} 254}
256 255
257void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 256static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
257 kprobe_opcode_t *slot, int dirty)
258{ 258{
259 struct kprobe_insn_page *kip; 259 struct kprobe_insn_page *kip;
260 260
261 mutex_lock(&kprobe_insn_mutex); 261 list_for_each_entry(kip, &c->pages, list) {
262 list_for_each_entry(kip, &kprobe_insn_pages, list) { 262 long idx = ((long)slot - (long)kip->insns) /
263 if (kip->insns <= slot && 263 (c->insn_size * sizeof(kprobe_opcode_t));
264 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 264 if (idx >= 0 && idx < slots_per_page(c)) {
265 int i = (slot - kip->insns) / MAX_INSN_SIZE; 265 WARN_ON(kip->slot_used[idx] != SLOT_USED);
266 if (dirty) { 266 if (dirty) {
267 kip->slot_used[i] = SLOT_DIRTY; 267 kip->slot_used[idx] = SLOT_DIRTY;
268 kip->ngarbage++; 268 kip->ngarbage++;
269 if (++c->nr_garbage > slots_per_page(c))
270 collect_garbage_slots(c);
269 } else 271 } else
270 collect_one_slot(kip, i); 272 collect_one_slot(kip, idx);
271 break; 273 return;
272 } 274 }
273 } 275 }
276 /* Could not free this slot. */
277 WARN_ON(1);
278}
274 279
275 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 280void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
276 collect_garbage_slots(); 281{
277 282 mutex_lock(&kprobe_insn_mutex);
283 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
278 mutex_unlock(&kprobe_insn_mutex); 284 mutex_unlock(&kprobe_insn_mutex);
279} 285}
286#ifdef CONFIG_OPTPROBES
287/* For optimized_kprobe buffer */
288static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */
289static struct kprobe_insn_cache kprobe_optinsn_slots = {
290 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
291 /* .insn_size is initialized later */
292 .nr_garbage = 0,
293};
294/* Get a slot for optimized_kprobe buffer */
295kprobe_opcode_t __kprobes *get_optinsn_slot(void)
296{
297 kprobe_opcode_t *ret = NULL;
298
299 mutex_lock(&kprobe_optinsn_mutex);
300 ret = __get_insn_slot(&kprobe_optinsn_slots);
301 mutex_unlock(&kprobe_optinsn_mutex);
302
303 return ret;
304}
305
306void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
307{
308 mutex_lock(&kprobe_optinsn_mutex);
309 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
310 mutex_unlock(&kprobe_optinsn_mutex);
311}
312#endif
280#endif 313#endif
281 314
282/* We have preemption disabled.. so it is safe to use __ versions */ 315/* We have preemption disabled.. so it is safe to use __ versions */
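
For context, this is roughly how arch code consumes the slot caches. The ->ainsn.insn field matches x86 but is an assumption here, and the function names are illustrative:

#include <linux/errno.h>
#include <linux/kprobes.h>

static int example_arch_prepare(struct kprobe *p)
{
	p->ainsn.insn = get_insn_slot();
	if (!p->ainsn.insn)
		return -ENOMEM;
	/* the real arch code copies and fixes up the probed instruction here */
	return 0;
}

static void example_arch_remove(struct kprobe *p)
{
	if (p->ainsn.insn) {
		/* dirty == 0: reclaim the slot immediately instead of deferring */
		free_insn_slot(p->ainsn.insn, 0);
		p->ainsn.insn = NULL;
	}
}
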
@@ -307,23 +340,401 @@ struct kprobe __kprobes *get_kprobe(void *addr)
307 if (p->addr == addr) 340 if (p->addr == addr)
308 return p; 341 return p;
309 } 342 }
343
310 return NULL; 344 return NULL;
311} 345}
312 346
347static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
348
349/* Return true if the kprobe is an aggregator */
350static inline int kprobe_aggrprobe(struct kprobe *p)
351{
352 return p->pre_handler == aggr_pre_handler;
353}
354
355/*
356 * Keep all fields in the kprobe consistent
357 */
358static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
359{
360 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
361 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
362}
363
364#ifdef CONFIG_OPTPROBES
365/* NOTE: change this value only with kprobe_mutex held */
366static bool kprobes_allow_optimization;
367
368/*
369 * Call all pre_handlers on the list, ignoring their return values.
370 * This must be called from the arch-dependent optimized caller.
371 */
372void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
373{
374 struct kprobe *kp;
375
376 list_for_each_entry_rcu(kp, &p->list, list) {
377 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
378 set_kprobe_instance(kp);
379 kp->pre_handler(kp, regs);
380 }
381 reset_kprobe_instance();
382 }
383}
384
385/* Return true(!0) if the kprobe is ready for optimization. */
386static inline int kprobe_optready(struct kprobe *p)
387{
388 struct optimized_kprobe *op;
389
390 if (kprobe_aggrprobe(p)) {
391 op = container_of(p, struct optimized_kprobe, kp);
392 return arch_prepared_optinsn(&op->optinsn);
393 }
394
395 return 0;
396}
397
398/*
399 * Return an optimized kprobe whose optimizing code replaces
400 * instructions including addr (exclude breakpoint).
401 */
402struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
403{
404 int i;
405 struct kprobe *p = NULL;
406 struct optimized_kprobe *op;
407
408 /* Don't check i == 0, since that is a breakpoint case. */
409 for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
410 p = get_kprobe((void *)(addr - i));
411
412 if (p && kprobe_optready(p)) {
413 op = container_of(p, struct optimized_kprobe, kp);
414 if (arch_within_optimized_kprobe(op, addr))
415 return p;
416 }
417
418 return NULL;
419}
420
421/* Optimization staging list, protected by kprobe_mutex */
422static LIST_HEAD(optimizing_list);
423
424static void kprobe_optimizer(struct work_struct *work);
425static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
426#define OPTIMIZE_DELAY 5
427
428/* Kprobe jump optimizer */
429static __kprobes void kprobe_optimizer(struct work_struct *work)
430{
431 struct optimized_kprobe *op, *tmp;
432
433 /* Lock modules while optimizing kprobes */
434 mutex_lock(&module_mutex);
435 mutex_lock(&kprobe_mutex);
436 if (kprobes_all_disarmed || !kprobes_allow_optimization)
437 goto end;
438
439 /*
440 * Wait for a quiescence period to ensure all running interrupts
441 * are done. Because an optprobe may modify multiple instructions,
442 * there is a chance that the Nth instruction is interrupted. In that
443 * case, a running interrupt can return into the 2nd-Nth byte of the
444 * jump instruction. This wait avoids that.
445 */
446 synchronize_sched();
447
448 /*
449 * The optimization/unoptimization path reads online_cpus via
450 * stop_machine(), while cpu-hotplug modifies online_cpus.
451 * At the same time, text_mutex is held both by cpu-hotplug and here.
452 * That combination can deadlock: cpu-hotplug tries to take
453 * text_mutex, but stop_machine() cannot run because online_cpus
454 * has changed.
455 * To avoid this deadlock, call get_online_cpus() so that
456 * cpu-hotplug cannot run while text_mutex is held.
457 */
458 get_online_cpus();
459 mutex_lock(&text_mutex);
460 list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
461 WARN_ON(kprobe_disabled(&op->kp));
462 if (arch_optimize_kprobe(op) < 0)
463 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
464 list_del_init(&op->list);
465 }
466 mutex_unlock(&text_mutex);
467 put_online_cpus();
468end:
469 mutex_unlock(&kprobe_mutex);
470 mutex_unlock(&module_mutex);
471}
472
473/* Optimize kprobe if p is ready to be optimized */
474static __kprobes void optimize_kprobe(struct kprobe *p)
475{
476 struct optimized_kprobe *op;
477
478 /* Check if the kprobe is disabled or not ready for optimization. */
479 if (!kprobe_optready(p) || !kprobes_allow_optimization ||
480 (kprobe_disabled(p) || kprobes_all_disarmed))
481 return;
482
483 /* Both of break_handler and post_handler are not supported. */
484 if (p->break_handler || p->post_handler)
485 return;
486
487 op = container_of(p, struct optimized_kprobe, kp);
488
489 /* Check there is no other kprobes at the optimized instructions */
490 if (arch_check_optimized_kprobe(op) < 0)
491 return;
492
493 /* Check if it is already optimized. */
494 if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
495 return;
496
497 op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
498 list_add(&op->list, &optimizing_list);
499 if (!delayed_work_pending(&optimizing_work))
500 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
501}
502
503/* Unoptimize a kprobe if p is optimized */
504static __kprobes void unoptimize_kprobe(struct kprobe *p)
505{
506 struct optimized_kprobe *op;
507
508 if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
509 op = container_of(p, struct optimized_kprobe, kp);
510 if (!list_empty(&op->list))
511 /* Dequeue from the optimization queue */
512 list_del_init(&op->list);
513 else
514 /* Replace jump with break */
515 arch_unoptimize_kprobe(op);
516 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
517 }
518}
519
520/* Remove optimized instructions */
521static void __kprobes kill_optimized_kprobe(struct kprobe *p)
522{
523 struct optimized_kprobe *op;
524
525 op = container_of(p, struct optimized_kprobe, kp);
526 if (!list_empty(&op->list)) {
527 /* Dequeue from the optimization queue */
528 list_del_init(&op->list);
529 op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
530 }
531 /* Don't unoptimize, because the target code will be freed. */
532 arch_remove_optimized_kprobe(op);
533}
534
535/* Try to prepare optimized instructions */
536static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
537{
538 struct optimized_kprobe *op;
539
540 op = container_of(p, struct optimized_kprobe, kp);
541 arch_prepare_optimized_kprobe(op);
542}
543
544/* Free optimized instructions and optimized_kprobe */
545static __kprobes void free_aggr_kprobe(struct kprobe *p)
546{
547 struct optimized_kprobe *op;
548
549 op = container_of(p, struct optimized_kprobe, kp);
550 arch_remove_optimized_kprobe(op);
551 kfree(op);
552}
553
554/* Allocate new optimized_kprobe and try to prepare optimized instructions */
555static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
556{
557 struct optimized_kprobe *op;
558
559 op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
560 if (!op)
561 return NULL;
562
563 INIT_LIST_HEAD(&op->list);
564 op->kp.addr = p->addr;
565 arch_prepare_optimized_kprobe(op);
566
567 return &op->kp;
568}
569
570static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
571
572/*
573 * Prepare an optimized_kprobe and optimize it
574 * NOTE: p must be a normal registered kprobe
575 */
576static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
577{
578 struct kprobe *ap;
579 struct optimized_kprobe *op;
580
581 ap = alloc_aggr_kprobe(p);
582 if (!ap)
583 return;
584
585 op = container_of(ap, struct optimized_kprobe, kp);
586 if (!arch_prepared_optinsn(&op->optinsn)) {
587 /* If failed to setup optimizing, fallback to kprobe */
588 free_aggr_kprobe(ap);
589 return;
590 }
591
592 init_aggr_kprobe(ap, p);
593 optimize_kprobe(ap);
594}
595
596#ifdef CONFIG_SYSCTL
597static void __kprobes optimize_all_kprobes(void)
598{
599 struct hlist_head *head;
600 struct hlist_node *node;
601 struct kprobe *p;
602 unsigned int i;
603
604 /* If optimization is already allowed, just return */
605 if (kprobes_allow_optimization)
606 return;
607
608 kprobes_allow_optimization = true;
609 mutex_lock(&text_mutex);
610 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
611 head = &kprobe_table[i];
612 hlist_for_each_entry_rcu(p, node, head, hlist)
613 if (!kprobe_disabled(p))
614 optimize_kprobe(p);
615 }
616 mutex_unlock(&text_mutex);
617 printk(KERN_INFO "Kprobes globally optimized\n");
618}
619
620static void __kprobes unoptimize_all_kprobes(void)
621{
622 struct hlist_head *head;
623 struct hlist_node *node;
624 struct kprobe *p;
625 unsigned int i;
626
627 /* If optimization is already prohibited, just return */
628 if (!kprobes_allow_optimization)
629 return;
630
631 kprobes_allow_optimization = false;
632 printk(KERN_INFO "Kprobes globally unoptimized\n");
633 get_online_cpus(); /* For avoiding text_mutex deadlock */
634 mutex_lock(&text_mutex);
635 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
636 head = &kprobe_table[i];
637 hlist_for_each_entry_rcu(p, node, head, hlist) {
638 if (!kprobe_disabled(p))
639 unoptimize_kprobe(p);
640 }
641 }
642
643 mutex_unlock(&text_mutex);
644 put_online_cpus();
645 /* Allow all currently running kprobes to complete */
646 synchronize_sched();
647}
648
649int sysctl_kprobes_optimization;
650int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
651 void __user *buffer, size_t *length,
652 loff_t *ppos)
653{
654 int ret;
655
656 mutex_lock(&kprobe_mutex);
657 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
658 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
659
660 if (sysctl_kprobes_optimization)
661 optimize_all_kprobes();
662 else
663 unoptimize_all_kprobes();
664 mutex_unlock(&kprobe_mutex);
665
666 return ret;
667}
668#endif /* CONFIG_SYSCTL */
669
670static void __kprobes __arm_kprobe(struct kprobe *p)
671{
672 struct kprobe *old_p;
673
674 /* Check collision with other optimized kprobes */
675 old_p = get_optimized_kprobe((unsigned long)p->addr);
676 if (unlikely(old_p))
677 unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
678
679 arch_arm_kprobe(p);
680 optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
681}
682
683static void __kprobes __disarm_kprobe(struct kprobe *p)
684{
685 struct kprobe *old_p;
686
687 unoptimize_kprobe(p); /* Try to unoptimize */
688 arch_disarm_kprobe(p);
689
690 /* If another kprobe was blocked, optimize it. */
691 old_p = get_optimized_kprobe((unsigned long)p->addr);
692 if (unlikely(old_p))
693 optimize_kprobe(old_p);
694}
695
696#else /* !CONFIG_OPTPROBES */
697
698#define optimize_kprobe(p) do {} while (0)
699#define unoptimize_kprobe(p) do {} while (0)
700#define kill_optimized_kprobe(p) do {} while (0)
701#define prepare_optimized_kprobe(p) do {} while (0)
702#define try_to_optimize_kprobe(p) do {} while (0)
703#define __arm_kprobe(p) arch_arm_kprobe(p)
704#define __disarm_kprobe(p) arch_disarm_kprobe(p)
705
706static __kprobes void free_aggr_kprobe(struct kprobe *p)
707{
708 kfree(p);
709}
710
711static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
712{
713 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
714}
715#endif /* CONFIG_OPTPROBES */
716
313/* Arm a kprobe with text_mutex */ 717/* Arm a kprobe with text_mutex */
314static void __kprobes arm_kprobe(struct kprobe *kp) 718static void __kprobes arm_kprobe(struct kprobe *kp)
315{ 719{
720 /*
721 * Here, since __arm_kprobe() doesn't use stop_machine(),
722 * this doesn't cause deadlock on text_mutex. So, we don't
723 * need get_online_cpus().
724 */
316 mutex_lock(&text_mutex); 725 mutex_lock(&text_mutex);
317 arch_arm_kprobe(kp); 726 __arm_kprobe(kp);
318 mutex_unlock(&text_mutex); 727 mutex_unlock(&text_mutex);
319} 728}
320 729
321/* Disarm a kprobe with text_mutex */ 730/* Disarm a kprobe with text_mutex */
322static void __kprobes disarm_kprobe(struct kprobe *kp) 731static void __kprobes disarm_kprobe(struct kprobe *kp)
323{ 732{
733 get_online_cpus(); /* For avoiding text_mutex deadlock */
324 mutex_lock(&text_mutex); 734 mutex_lock(&text_mutex);
325 arch_disarm_kprobe(kp); 735 __disarm_kprobe(kp);
326 mutex_unlock(&text_mutex); 736 mutex_unlock(&text_mutex);
737 put_online_cpus();
327} 738}
328 739
329/* 740/*
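
A probe only qualifies for this optimizer when it carries nothing but a pre_handler. A minimal sketch of such a probe, modeled on the in-tree kprobe sample; the "do_fork" target is just an illustrative choice:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe example_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= example_pre,
	/* no post_handler/break_handler, so optimize_kprobe() may convert it */
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
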
@@ -392,7 +803,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
392void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 803void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
393{ 804{
394 struct kprobe *kp; 805 struct kprobe *kp;
395 if (p->pre_handler != aggr_pre_handler) { 806 if (!kprobe_aggrprobe(p)) {
396 p->nmissed++; 807 p->nmissed++;
397 } else { 808 } else {
398 list_for_each_entry_rcu(kp, &p->list, list) 809 list_for_each_entry_rcu(kp, &p->list, list)
@@ -516,21 +927,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
516} 927}
517 928
518/* 929/*
519 * Keep all fields in the kprobe consistent
520 */
521static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
522{
523 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
524 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
525}
526
527/*
528* Add the new probe to ap->list. Fail if this is the 930* Add the new probe to ap->list. Fail if this is the
529* second jprobe at the address - two jprobes can't coexist 931* second jprobe at the address - two jprobes can't coexist
530*/ 932*/
531static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 933static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
532{ 934{
533 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 935 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
936
937 if (p->break_handler || p->post_handler)
938 unoptimize_kprobe(ap); /* Fall back to normal kprobe */
939
534 if (p->break_handler) { 940 if (p->break_handler) {
535 if (ap->break_handler) 941 if (ap->break_handler)
536 return -EEXIST; 942 return -EEXIST;
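
A jprobe is the common case that trips this fallback: register_jprobe() installs a break_handler on the aggregator, so the probe stays a plain breakpoint. A sketch modeled on the in-tree jprobe sample (the do_fork signature is assumed from that sample):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static long example_jdo_fork(unsigned long clone_flags, unsigned long stack_start,
			     struct pt_regs *regs, unsigned long stack_size,
			     int __user *parent_tidptr, int __user *child_tidptr)
{
	pr_info("clone_flags = 0x%lx\n", clone_flags);
	jprobe_return();	/* mandatory: hand control back to the probed code */
	return 0;
}

static struct jprobe example_jp = {
	.entry	= example_jdo_fork,
	.kp	= {
		.symbol_name	= "do_fork",
	},
};

static int __init example_jp_init(void)
{
	return register_jprobe(&example_jp);	/* forces the unoptimized path */
}

static void __exit example_jp_exit(void)
{
	unregister_jprobe(&example_jp);
}

module_init(example_jp_init);
module_exit(example_jp_exit);
MODULE_LICENSE("GPL");
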
@@ -545,7 +951,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
545 ap->flags &= ~KPROBE_FLAG_DISABLED; 951 ap->flags &= ~KPROBE_FLAG_DISABLED;
546 if (!kprobes_all_disarmed) 952 if (!kprobes_all_disarmed)
547 /* Arm the breakpoint again. */ 953 /* Arm the breakpoint again. */
548 arm_kprobe(ap); 954 __arm_kprobe(ap);
549 } 955 }
550 return 0; 956 return 0;
551} 957}
@@ -554,12 +960,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
554 * Fill in the required fields of the "manager kprobe". Replace the 960 * Fill in the required fields of the "manager kprobe". Replace the
555 * earlier kprobe in the hlist with the manager kprobe 961 * earlier kprobe in the hlist with the manager kprobe
556 */ 962 */
557static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 963static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
558{ 964{
965 /* Copy p's insn slot to ap */
559 copy_kprobe(p, ap); 966 copy_kprobe(p, ap);
560 flush_insn_slot(ap); 967 flush_insn_slot(ap);
561 ap->addr = p->addr; 968 ap->addr = p->addr;
562 ap->flags = p->flags; 969 ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
563 ap->pre_handler = aggr_pre_handler; 970 ap->pre_handler = aggr_pre_handler;
564 ap->fault_handler = aggr_fault_handler; 971 ap->fault_handler = aggr_fault_handler;
565 /* We don't care the kprobe which has gone. */ 972 /* We don't care the kprobe which has gone. */
@@ -569,8 +976,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
569 ap->break_handler = aggr_break_handler; 976 ap->break_handler = aggr_break_handler;
570 977
571 INIT_LIST_HEAD(&ap->list); 978 INIT_LIST_HEAD(&ap->list);
572 list_add_rcu(&p->list, &ap->list); 979 INIT_HLIST_NODE(&ap->hlist);
573 980
981 list_add_rcu(&p->list, &ap->list);
574 hlist_replace_rcu(&p->hlist, &ap->hlist); 982 hlist_replace_rcu(&p->hlist, &ap->hlist);
575} 983}
576 984
@@ -584,12 +992,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
584 int ret = 0; 992 int ret = 0;
585 struct kprobe *ap = old_p; 993 struct kprobe *ap = old_p;
586 994
587 if (old_p->pre_handler != aggr_pre_handler) { 995 if (!kprobe_aggrprobe(old_p)) {
588 /* If old_p is not an aggr_probe, create new aggr_kprobe. */ 996 /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
589 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 997 ap = alloc_aggr_kprobe(old_p);
590 if (!ap) 998 if (!ap)
591 return -ENOMEM; 999 return -ENOMEM;
592 add_aggr_kprobe(ap, old_p); 1000 init_aggr_kprobe(ap, old_p);
593 } 1001 }
594 1002
595 if (kprobe_gone(ap)) { 1003 if (kprobe_gone(ap)) {
@@ -608,6 +1016,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
608 */ 1016 */
609 return ret; 1017 return ret;
610 1018
1019 /* Prepare optimized instructions if possible. */
1020 prepare_optimized_kprobe(ap);
1021
611 /* 1022 /*
612 * Clear gone flag to prevent allocating new slot again, and 1023 * Clear gone flag to prevent allocating new slot again, and
613 * set disabled flag because it is not armed yet. 1024 * set disabled flag because it is not armed yet.
@@ -616,6 +1027,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
616 | KPROBE_FLAG_DISABLED; 1027 | KPROBE_FLAG_DISABLED;
617 } 1028 }
618 1029
1030 /* Copy ap's insn slot to p */
619 copy_kprobe(ap, p); 1031 copy_kprobe(ap, p);
620 return add_new_kprobe(ap, p); 1032 return add_new_kprobe(ap, p);
621} 1033}
@@ -728,7 +1140,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 1140
729 preempt_disable(); 1141 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 1142 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 1143 in_kprobes_functions((unsigned long) p->addr) ||
1144 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 1145 preempt_enable();
733 return -EINVAL; 1146 return -EINVAL;
734 } 1147 }
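
Callers are unaffected by the extra check: probing an address inside ftrace-reserved text now simply fails like any other rejected address. A small illustrative wrapper:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int example_try_probe(struct kprobe *kp)
{
	int ret = register_kprobe(kp);

	if (ret == -EINVAL)
		pr_info("kprobes: address rejected (blacklisted or ftrace-reserved)\n");
	return ret;
}
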
@@ -765,27 +1178,34 @@ int __kprobes register_kprobe(struct kprobe *p)
765 p->nmissed = 0; 1178 p->nmissed = 0;
766 INIT_LIST_HEAD(&p->list); 1179 INIT_LIST_HEAD(&p->list);
767 mutex_lock(&kprobe_mutex); 1180 mutex_lock(&kprobe_mutex);
1181
1182 get_online_cpus(); /* For avoiding text_mutex deadlock. */
1183 mutex_lock(&text_mutex);
1184
768 old_p = get_kprobe(p->addr); 1185 old_p = get_kprobe(p->addr);
769 if (old_p) { 1186 if (old_p) {
1187 /* Since this may unoptimize old_p, locking text_mutex. */
770 ret = register_aggr_kprobe(old_p, p); 1188 ret = register_aggr_kprobe(old_p, p);
771 goto out; 1189 goto out;
772 } 1190 }
773 1191
774 mutex_lock(&text_mutex);
775 ret = arch_prepare_kprobe(p); 1192 ret = arch_prepare_kprobe(p);
776 if (ret) 1193 if (ret)
777 goto out_unlock_text; 1194 goto out;
778 1195
779 INIT_HLIST_NODE(&p->hlist); 1196 INIT_HLIST_NODE(&p->hlist);
780 hlist_add_head_rcu(&p->hlist, 1197 hlist_add_head_rcu(&p->hlist,
781 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 1198 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
782 1199
783 if (!kprobes_all_disarmed && !kprobe_disabled(p)) 1200 if (!kprobes_all_disarmed && !kprobe_disabled(p))
784 arch_arm_kprobe(p); 1201 __arm_kprobe(p);
1202
1203 /* Try to optimize kprobe */
1204 try_to_optimize_kprobe(p);
785 1205
786out_unlock_text:
787 mutex_unlock(&text_mutex);
788out: 1206out:
1207 mutex_unlock(&text_mutex);
1208 put_online_cpus();
789 mutex_unlock(&kprobe_mutex); 1209 mutex_unlock(&kprobe_mutex);
790 1210
791 if (probed_mod) 1211 if (probed_mod)
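
The nesting order this hunk establishes, as a stand-alone sketch (the two mutexes below are stand-ins for kprobe_mutex and text_mutex, not the real symbols):

#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(outer_lock);	/* stands in for kprobe_mutex */
static DEFINE_MUTEX(patch_lock);	/* stands in for text_mutex */

static void example_lock_order(void)
{
	mutex_lock(&outer_lock);
	get_online_cpus();		/* pin cpu-hotplug before taking the patch lock */
	mutex_lock(&patch_lock);

	/* code patching / optimization would happen here */

	mutex_unlock(&patch_lock);
	put_online_cpus();
	mutex_unlock(&outer_lock);
}
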
@@ -807,7 +1227,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
807 return -EINVAL; 1227 return -EINVAL;
808 1228
809 if (old_p == p || 1229 if (old_p == p ||
810 (old_p->pre_handler == aggr_pre_handler && 1230 (kprobe_aggrprobe(old_p) &&
811 list_is_singular(&old_p->list))) { 1231 list_is_singular(&old_p->list))) {
812 /* 1232 /*
813 * Only probe on the hash list. Disarm only if kprobes are 1233 * Only probe on the hash list. Disarm only if kprobes are
@@ -815,7 +1235,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p)
815 * already have been removed. We save on flushing icache. 1235 * already have been removed. We save on flushing icache.
816 */ 1236 */
817 if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) 1237 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
818 disarm_kprobe(p); 1238 disarm_kprobe(old_p);
819 hlist_del_rcu(&old_p->hlist); 1239 hlist_del_rcu(&old_p->hlist);
820 } else { 1240 } else {
821 if (p->break_handler && !kprobe_gone(p)) 1241 if (p->break_handler && !kprobe_gone(p))
@@ -831,8 +1251,13 @@ noclean:
831 list_del_rcu(&p->list); 1251 list_del_rcu(&p->list);
832 if (!kprobe_disabled(old_p)) { 1252 if (!kprobe_disabled(old_p)) {
833 try_to_disable_aggr_kprobe(old_p); 1253 try_to_disable_aggr_kprobe(old_p);
834 if (!kprobes_all_disarmed && kprobe_disabled(old_p)) 1254 if (!kprobes_all_disarmed) {
835 disarm_kprobe(old_p); 1255 if (kprobe_disabled(old_p))
1256 disarm_kprobe(old_p);
1257 else
1258 /* Try to optimize this probe again */
1259 optimize_kprobe(old_p);
1260 }
836 } 1261 }
837 } 1262 }
838 return 0; 1263 return 0;
@@ -849,7 +1274,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
849 old_p = list_entry(p->list.next, struct kprobe, list); 1274 old_p = list_entry(p->list.next, struct kprobe, list);
850 list_del(&p->list); 1275 list_del(&p->list);
851 arch_remove_kprobe(old_p); 1276 arch_remove_kprobe(old_p);
852 kfree(old_p); 1277 free_aggr_kprobe(old_p);
853 } 1278 }
854} 1279}
855 1280
@@ -1145,7 +1570,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1145 struct kprobe *kp; 1570 struct kprobe *kp;
1146 1571
1147 p->flags |= KPROBE_FLAG_GONE; 1572 p->flags |= KPROBE_FLAG_GONE;
1148 if (p->pre_handler == aggr_pre_handler) { 1573 if (kprobe_aggrprobe(p)) {
1149 /* 1574 /*
1150 * If this is an aggr_kprobe, we have to list all the 1575 * If this is an aggr_kprobe, we have to list all the
1151 * chained probes and mark them GONE. 1576 * chained probes and mark them GONE.
@@ -1154,6 +1579,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1154 kp->flags |= KPROBE_FLAG_GONE; 1579 kp->flags |= KPROBE_FLAG_GONE;
1155 p->post_handler = NULL; 1580 p->post_handler = NULL;
1156 p->break_handler = NULL; 1581 p->break_handler = NULL;
1582 kill_optimized_kprobe(p);
1157 } 1583 }
1158 /* 1584 /*
1159 * Here, we can remove insn_slot safely, because no thread calls 1585 * Here, we can remove insn_slot safely, because no thread calls
@@ -1162,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1162 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1163} 1589}
1164 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone; we cannot enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
1165void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1166{ 1658{
1167 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
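
Moving disable_kprobe()/enable_kprobe() out of the CONFIG_DEBUG_FS block makes them usable from any kernel code. A minimal sketch of toggling an already-registered probe:

#include <linux/kprobes.h>
#include <linux/types.h>

/* kp must already have been registered with register_kprobe() */
static int example_set_probe_active(struct kprobe *kp, bool active)
{
	return active ? enable_kprobe(kp) : disable_kprobe(kp);
}
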
@@ -1263,6 +1755,15 @@ static int __init init_kprobes(void)
1263 } 1755 }
1264 } 1756 }
1265 1757
1758#if defined(CONFIG_OPTPROBES)
1759#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
1760 /* Init kprobe_optinsn_slots */
1761 kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
1762#endif
1763 /* By default, kprobes can be optimized */
1764 kprobes_allow_optimization = true;
1765#endif
1766
1266 /* By default, kprobes are armed */ 1767 /* By default, kprobes are armed */
1267 kprobes_all_disarmed = false; 1768 kprobes_all_disarmed = false;
1268 1769
@@ -1281,7 +1782,7 @@ static int __init init_kprobes(void)
1281 1782
1282#ifdef CONFIG_DEBUG_FS 1783#ifdef CONFIG_DEBUG_FS
1283static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 1784static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1284 const char *sym, int offset,char *modname) 1785 const char *sym, int offset, char *modname, struct kprobe *pp)
1285{ 1786{
1286 char *kprobe_type; 1787 char *kprobe_type;
1287 1788
@@ -1291,19 +1792,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1291 kprobe_type = "j"; 1792 kprobe_type = "j";
1292 else 1793 else
1293 kprobe_type = "k"; 1794 kprobe_type = "k";
1795
1294 if (sym) 1796 if (sym)
1295 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", 1797 seq_printf(pi, "%p %s %s+0x%x %s ",
1296 p->addr, kprobe_type, sym, offset, 1798 p->addr, kprobe_type, sym, offset,
1297 (modname ? modname : " "), 1799 (modname ? modname : " "));
1298 (kprobe_gone(p) ? "[GONE]" : ""),
1299 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1300 "[DISABLED]" : ""));
1301 else 1800 else
1302 seq_printf(pi, "%p %s %p %s%s\n", 1801 seq_printf(pi, "%p %s %p ",
1303 p->addr, kprobe_type, p->addr, 1802 p->addr, kprobe_type, p->addr);
1304 (kprobe_gone(p) ? "[GONE]" : ""), 1803
1305 ((kprobe_disabled(p) && !kprobe_gone(p)) ? 1804 if (!pp)
1306 "[DISABLED]" : "")); 1805 pp = p;
1806 seq_printf(pi, "%s%s%s\n",
1807 (kprobe_gone(p) ? "[GONE]" : ""),
1808 ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
1809 (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
1307} 1810}
1308 1811
1309static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1812static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1339,11 +1842,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1339 hlist_for_each_entry_rcu(p, node, head, hlist) { 1842 hlist_for_each_entry_rcu(p, node, head, hlist) {
1340 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 1843 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
1341 &offset, &modname, namebuf); 1844 &offset, &modname, namebuf);
1342 if (p->pre_handler == aggr_pre_handler) { 1845 if (kprobe_aggrprobe(p)) {
1343 list_for_each_entry_rcu(kp, &p->list, list) 1846 list_for_each_entry_rcu(kp, &p->list, list)
1344 report_probe(pi, kp, sym, offset, modname); 1847 report_probe(pi, kp, sym, offset, modname, p);
1345 } else 1848 } else
1346 report_probe(pi, p, sym, offset, modname); 1849 report_probe(pi, p, sym, offset, modname, NULL);
1347 } 1850 }
1348 preempt_enable(); 1851 preempt_enable();
1349 return 0; 1852 return 0;
@@ -1368,71 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1368 .release = seq_release, 1871 .release = seq_release,
1369}; 1872};
1370 1873
1371/* Disable one kprobe */
1372int __kprobes disable_kprobe(struct kprobe *kp)
1373{
1374 int ret = 0;
1375 struct kprobe *p;
1376
1377 mutex_lock(&kprobe_mutex);
1378
1379 /* Check whether specified probe is valid. */
1380 p = __get_valid_kprobe(kp);
1381 if (unlikely(p == NULL)) {
1382 ret = -EINVAL;
1383 goto out;
1384 }
1385
1386 /* If the probe is already disabled (or gone), just return */
1387 if (kprobe_disabled(kp))
1388 goto out;
1389
1390 kp->flags |= KPROBE_FLAG_DISABLED;
1391 if (p != kp)
1392 /* When kp != p, p is always enabled. */
1393 try_to_disable_aggr_kprobe(p);
1394
1395 if (!kprobes_all_disarmed && kprobe_disabled(p))
1396 disarm_kprobe(p);
1397out:
1398 mutex_unlock(&kprobe_mutex);
1399 return ret;
1400}
1401EXPORT_SYMBOL_GPL(disable_kprobe);
1402
1403/* Enable one kprobe */
1404int __kprobes enable_kprobe(struct kprobe *kp)
1405{
1406 int ret = 0;
1407 struct kprobe *p;
1408
1409 mutex_lock(&kprobe_mutex);
1410
1411 /* Check whether specified probe is valid. */
1412 p = __get_valid_kprobe(kp);
1413 if (unlikely(p == NULL)) {
1414 ret = -EINVAL;
1415 goto out;
1416 }
1417
1418 if (kprobe_gone(kp)) {
1419 /* This kprobe has gone, we couldn't enable it. */
1420 ret = -EINVAL;
1421 goto out;
1422 }
1423
1424 if (!kprobes_all_disarmed && kprobe_disabled(p))
1425 arm_kprobe(p);
1426
1427 p->flags &= ~KPROBE_FLAG_DISABLED;
1428 if (p != kp)
1429 kp->flags &= ~KPROBE_FLAG_DISABLED;
1430out:
1431 mutex_unlock(&kprobe_mutex);
1432 return ret;
1433}
1434EXPORT_SYMBOL_GPL(enable_kprobe);
1435
1436static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1437{ 1875{
1438 struct hlist_head *head; 1876 struct hlist_head *head;
@@ -1446,12 +1884,13 @@ static void __kprobes arm_all_kprobes(void)
1446 if (!kprobes_all_disarmed) 1884 if (!kprobes_all_disarmed)
1447 goto already_enabled; 1885 goto already_enabled;
1448 1886
1887 /* Arming kprobes doesn't optimize kprobe itself */
1449 mutex_lock(&text_mutex); 1888 mutex_lock(&text_mutex);
1450 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1889 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1451 head = &kprobe_table[i]; 1890 head = &kprobe_table[i];
1452 hlist_for_each_entry_rcu(p, node, head, hlist) 1891 hlist_for_each_entry_rcu(p, node, head, hlist)
1453 if (!kprobe_disabled(p)) 1892 if (!kprobe_disabled(p))
1454 arch_arm_kprobe(p); 1893 __arm_kprobe(p);
1455 } 1894 }
1456 mutex_unlock(&text_mutex); 1895 mutex_unlock(&text_mutex);
1457 1896
@@ -1478,16 +1917,23 @@ static void __kprobes disarm_all_kprobes(void)
1478 1917
1479 kprobes_all_disarmed = true; 1918 kprobes_all_disarmed = true;
1480 printk(KERN_INFO "Kprobes globally disabled\n"); 1919 printk(KERN_INFO "Kprobes globally disabled\n");
1920
1921 /*
1922 * Here we call get_online_cpus() for avoiding text_mutex deadlock,
1923 * because disarming may also unoptimize kprobes.
1924 */
1925 get_online_cpus();
1481 mutex_lock(&text_mutex); 1926 mutex_lock(&text_mutex);
1482 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1927 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1483 head = &kprobe_table[i]; 1928 head = &kprobe_table[i];
1484 hlist_for_each_entry_rcu(p, node, head, hlist) { 1929 hlist_for_each_entry_rcu(p, node, head, hlist) {
1485 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 1930 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1486 arch_disarm_kprobe(p); 1931 __disarm_kprobe(p);
1487 } 1932 }
1488 } 1933 }
1489 1934
1490 mutex_unlock(&text_mutex); 1935 mutex_unlock(&text_mutex);
1936 put_online_cpus();
1491 mutex_unlock(&kprobe_mutex); 1937 mutex_unlock(&kprobe_mutex);
1492 /* Allow all currently running kprobes to complete */ 1938 /* Allow all currently running kprobes to complete */
1493 synchronize_sched(); 1939 synchronize_sched();
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
33} 33}
34KERNEL_ATTR_RO(uevent_seqnum); 34KERNEL_ATTR_RO(uevent_seqnum);
35 35
36/* uevent helper program, used during early boo */ 36/* uevent helper program, used during early boot */
37static ssize_t uevent_helper_show(struct kobject *kobj, 37static ssize_t uevent_helper_show(struct kobject *kobj,
38 struct kobj_attribute *attr, char *buf) 38 struct kobj_attribute *attr, char *buf)
39{ 39{
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
@@ -197,16 +198,8 @@ static int __init ksysfs_init(void)
197 goto group_exit; 198 goto group_exit;
198 } 199 }
199 200
200 /* create the /sys/kernel/uids/ directory */
201 error = uids_sysfs_init();
202 if (error)
203 goto notes_exit;
204
205 return 0; 201 return 0;
206 202
207notes_exit:
208 if (notes_size > 0)
209 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
210group_exit: 203group_exit:
211 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 204 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
212kset_exit: 205kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 5feaddcdbe49..54286798c37b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
@@ -430,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
430/* 431/*
431 * Various lockdep statistics: 432 * Various lockdep statistics:
432 */ 433 */
433atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
434atomic_t chain_lookup_misses;
435atomic_t hardirqs_on_events;
436atomic_t hardirqs_off_events;
437atomic_t redundant_hardirqs_on;
438atomic_t redundant_hardirqs_off;
439atomic_t softirqs_on_events;
440atomic_t softirqs_off_events;
441atomic_t redundant_softirqs_on;
442atomic_t redundant_softirqs_off;
443atomic_t nr_unused_locks;
444atomic_t nr_cyclic_checks;
445atomic_t nr_find_usage_forwards_checks;
446atomic_t nr_find_usage_backwards_checks;
447#endif 435#endif
448 436
449/* 437/*
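
The pattern the stats switch to, in a self-contained sketch (the struct and field names below are illustrative; the real ones live in lockdep_internals.h):

#include <linux/cpumask.h>
#include <linux/percpu.h>

struct example_stats {
	int	lookup_hits;
	int	lookup_misses;
};

static DEFINE_PER_CPU(struct example_stats, example_stats);

static inline void example_inc_hits(void)
{
	/* no atomic op: each CPU only ever touches its own copy */
	__this_cpu_inc(example_stats.lookup_hits);
}

static int example_sum_hits(void)
{
	int cpu, sum = 0;

	/* readers fold the per-cpu copies together when reporting */
	for_each_possible_cpu(cpu)
		sum += per_cpu(example_stats, cpu).lookup_hits;
	return sum;
}
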
@@ -582,9 +570,6 @@ static int static_obj(void *obj)
582 unsigned long start = (unsigned long) &_stext, 570 unsigned long start = (unsigned long) &_stext,
583 end = (unsigned long) &_end, 571 end = (unsigned long) &_end,
584 addr = (unsigned long) obj; 572 addr = (unsigned long) obj;
585#ifdef CONFIG_SMP
586 int i;
587#endif
588 573
589 /* 574 /*
590 * static variable? 575 * static variable?
@@ -595,24 +580,16 @@ static int static_obj(void *obj)
595 if (arch_is_kernel_data(addr)) 580 if (arch_is_kernel_data(addr))
596 return 1; 581 return 1;
597 582
598#ifdef CONFIG_SMP
599 /* 583 /*
600 * percpu var? 584 * in-kernel percpu var?
601 */ 585 */
602 for_each_possible_cpu(i) { 586 if (is_kernel_percpu_address(addr))
603 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 587 return 1;
604 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
605 + per_cpu_offset(i);
606
607 if ((addr >= start) && (addr < end))
608 return 1;
609 }
610#endif
611 588
612 /* 589 /*
613 * module var? 590 * module static or percpu var?
614 */ 591 */
615 return is_module_address(addr); 592 return is_module_address(addr) || is_module_percpu_address(addr);
616} 593}
617 594
618/* 595/*
@@ -758,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
758 return NULL; 735 return NULL;
759 } 736 }
760 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
761 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
762 class->key = key; 739 class->key = key;
763 class->name = lock->name; 740 class->name = lock->name;
764 class->subclass = subclass; 741 class->subclass = subclass;
@@ -828,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
828 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
829 */ 806 */
830static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
831 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
832{ 810{
833 struct lock_list *entry; 811 struct lock_list *entry;
834 /* 812 /*
@@ -839,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
839 if (!entry) 817 if (!entry)
840 return 0; 818 return 0;
841 819
842 if (!save_trace(&entry->trace))
843 return 0;
844
845 entry->class = this; 820 entry->class = this;
846 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
847 /* 823 /*
848 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
849 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1215,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1215{ 1191{
1216 int result; 1192 int result;
1217 1193
1218 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1219 1195
1220 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1221 1197
@@ -1252,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1252{ 1228{
1253 int result; 1229 int result;
1254 1230
1255 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1256 1232
1257 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1258 1234
@@ -1275,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1275{ 1251{
1276 int result; 1252 int result;
1277 1253
1278 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1279 1255
1280 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1281 1257
@@ -1645,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1645 */ 1621 */
1646static int 1622static int
1647check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1648 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1649{ 1625{
1650 struct lock_list *entry; 1626 struct lock_list *entry;
1651 int ret; 1627 int ret;
1652 struct lock_list this; 1628 struct lock_list this;
1653 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1654 1638
1655 /* 1639 /*
1656 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1698,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1698 } 1682 }
1699 } 1683 }
1700 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1701 /* 1688 /*
1702 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1703 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1704 */ 1691 */
1705 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1706 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1707 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1708 1695
1709 if (!ret) 1696 if (!ret)
1710 return 0; 1697 return 0;
1711 1698
1712 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1713 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1714 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1715 if (!ret) 1702 if (!ret)
1716 return 0; 1703 return 0;
1717 1704
@@ -1741,6 +1728,7 @@ static int
1741check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1742{ 1729{
1743 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1744 struct held_lock *hlock; 1732 struct held_lock *hlock;
1745 1733
1746 /* 1734 /*
@@ -1766,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1766 * added: 1754 * added:
1767 */ 1755 */
1768 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1769 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1770 return 0; 1759 return 0;
1771 /* 1760 /*
1772 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1789,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1789 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1790 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1791 break; 1780 break;
1781 trylock_loop = 1;
1792 } 1782 }
1793 return 1; 1783 return 1;
1794out_bug: 1784out_bug:
@@ -1835,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1835 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1836 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1837cache_hit: 1827cache_hit:
1838 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1839 if (very_verbose(class)) 1829 if (very_verbose(class))
1840 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1841 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1900,7 +1890,7 @@ cache_hit:
1900 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1901 } 1891 }
1902 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1903 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1904 inc_chains(); 1894 inc_chains();
1905 1895
1906 return 1; 1896 return 1;
@@ -2147,7 +2137,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2147 return ret; 2137 return ret;
2148 2138
2149 return print_irq_inversion_bug(curr, &root, target_entry, 2139 return print_irq_inversion_bug(curr, &root, target_entry,
2150 this, 1, irqclass); 2140 this, 0, irqclass);
2151} 2141}
2152 2142
2153void print_irqtrace_events(struct task_struct *curr) 2143void print_irqtrace_events(struct task_struct *curr)
@@ -2321,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2321 return; 2311 return;
2322 2312
2323 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2324 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
2315 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2325 return; 2320 return;
2326 } 2321 }
2327 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2348,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2348 2343
2349 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2350 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2351 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2352} 2347}
2353EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2354 2349
@@ -2380,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2380 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2381 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2382 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2383 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2384 } else 2379 } else
2385 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2386} 2381}
2387EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2388 2383
@@ -2406,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2406 return; 2401 return;
2407 2402
2408 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2409 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2410 return; 2405 return;
2411 } 2406 }
2412 2407
@@ -2416,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2416 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2417 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2418 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2419 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2420 /* 2415 /*
2421 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2422 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2446,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2446 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2447 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2448 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2449 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2450 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2451 } else 2446 } else
2452 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2453} 2448}
2454 2449
2455static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2654,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2654 return 0; 2649 return 0;
2655 break; 2650 break;
2656 case LOCK_USED: 2651 case LOCK_USED:
2657 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2658 break; 2653 break;
2659 default: 2654 default:
2660 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2716,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2716} 2711}
2717EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2718 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
2719/* 2716/*
2720 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2721 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2750,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 return 0; 2747 return 0;
2751 } 2748 }
2752 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2753 if (!subclass) 2753 if (!subclass)
2754 class = lock->class_cache; 2754 class = lock->class_cache;
2755 /* 2755 /*
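
The two hunks above introduce __lockdep_no_validate__, a dedicated lock_class_key whose address serves as a sentinel: when __lock_acquire() sees a lock registered with that key it drops the check level to 1, so the dependency engine does only basic bookkeeping for that class (the driver core's device mutexes were the intended first user). A minimal userspace sketch of the sentinel-key idea, with illustrative names only, not the kernel API:

#include <stdio.h>

/* Illustrative sketch: a distinguished key object whose address means
 * "do not fully validate"; real lockdep uses struct lock_class_key. */
struct class_key { char dummy; };

static struct class_key no_validate_key;   /* sentinel: skip deep checks */
static struct class_key normal_key;

struct fake_lock {
    const char *name;
    struct class_key *key;
};

static void acquire(struct fake_lock *l, int check)
{
    if (l->key == &no_validate_key)
        check = 1;              /* downgrade: basic bookkeeping only */
    printf("%s: %s validation\n", l->name,
           check == 2 ? "full" : "reduced");
}

int main(void)
{
    struct fake_lock a = { "dev->mutex-like", &no_validate_key };
    struct fake_lock b = { "ordinary lock",   &normal_key };

    acquire(&a, 2);   /* prints "reduced validation" */
    acquire(&b, 2);   /* prints "full validation" */
    return 0;
}
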
@@ -2760,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2760 if (!class) 2760 if (!class)
2761 return 0; 2761 return 0;
2762 } 2762 }
2763 debug_atomic_inc((atomic_t *)&class->ops); 2763 atomic_inc((atomic_t *)&class->ops);
2764 if (very_verbose(class)) { 2764 if (very_verbose(class)) {
2765 printk("\nacquire class [%p] %s", class->key, class->name); 2765 printk("\nacquire class [%p] %s", class->key, class->name);
2766 if (class->name_version > 1) 2766 if (class->name_version > 1)
@@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3211{ 3211{
3212 unsigned long flags; 3212 unsigned long flags;
3213 3213
3214 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3215
3216 if (unlikely(current->lockdep_recursion)) 3214 if (unlikely(current->lockdep_recursion))
3217 return; 3215 return;
3218 3216
@@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3220 check_flags(flags); 3218 check_flags(flags);
3221 3219
3222 current->lockdep_recursion = 1; 3220 current->lockdep_recursion = 1;
3221 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3223 __lock_acquire(lock, subclass, trylock, read, check, 3222 __lock_acquire(lock, subclass, trylock, read, check,
3224 irqs_disabled_flags(flags), nest_lock, ip, 0); 3223 irqs_disabled_flags(flags), nest_lock, ip, 0);
3225 current->lockdep_recursion = 0; 3224 current->lockdep_recursion = 0;
@@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested,
3232{ 3231{
3233 unsigned long flags; 3232 unsigned long flags;
3234 3233
3235 trace_lock_release(lock, nested, ip);
3236
3237 if (unlikely(current->lockdep_recursion)) 3234 if (unlikely(current->lockdep_recursion))
3238 return; 3235 return;
3239 3236
3240 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3241 check_flags(flags); 3238 check_flags(flags);
3242 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3240 trace_lock_release(lock, ip);
3243 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3244 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3245 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3392,7 +3390,7 @@ found_it:
3392 hlock->holdtime_stamp = now; 3390 hlock->holdtime_stamp = now;
3393 } 3391 }
3394 3392
3395 trace_lock_acquired(lock, ip, waittime); 3393 trace_lock_acquired(lock, ip);
3396 3394
3397 stats = get_lock_stats(hlock_class(hlock)); 3395 stats = get_lock_stats(hlock_class(hlock));
3398 if (waittime) { 3396 if (waittime) {
@@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3413{ 3411{
3414 unsigned long flags; 3412 unsigned long flags;
3415 3413
3416 trace_lock_contended(lock, ip);
3417
3418 if (unlikely(!lock_stat)) 3414 if (unlikely(!lock_stat))
3419 return; 3415 return;
3420 3416
@@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3424 raw_local_irq_save(flags); 3420 raw_local_irq_save(flags);
3425 check_flags(flags); 3421 check_flags(flags);
3426 current->lockdep_recursion = 1; 3422 current->lockdep_recursion = 1;
3423 trace_lock_contended(lock, ip);
3427 __lock_contended(lock, ip); 3424 __lock_contended(lock, ip);
3428 current->lockdep_recursion = 0; 3425 current->lockdep_recursion = 0;
3429 raw_local_irq_restore(flags); 3426 raw_local_irq_restore(flags);
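
The lock_acquire()/lock_release()/lock_contended() hunks above all make the same change: the trace_lock_*() tracepoints now fire only after current->lockdep_recursion has been set, so any locking done by the tracing machinery returns early instead of re-entering lockdep. The shape is an ordinary per-task reentrancy guard; a self-contained sketch with a thread-local flag (illustrative only, relies on the common __thread extension):

#include <stdio.h>

static __thread int in_lockdep;   /* per-thread recursion guard */

static void emit_trace(const char *what);

static void lock_acquire_annotated(const char *name)
{
    if (in_lockdep)               /* instrumentation re-entered us: bail */
        return;

    in_lockdep = 1;
    emit_trace(name);             /* safe: a nested call returns early */
    /* ... real dependency tracking would go here ... */
    in_lockdep = 0;
}

static void emit_trace(const char *what)
{
    /* imagine this takes locks of its own and re-enters the annotation */
    lock_acquire_annotated("internal");
    printf("trace: %s\n", what);
}

int main(void)
{
    lock_acquire_annotated("my_lock");
    return 0;
}
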
@@ -3809,3 +3806,25 @@ void lockdep_sys_exit(void)
3809 lockdep_print_held_locks(curr); 3806 lockdep_print_held_locks(curr);
3810 } 3807 }
3811} 3808}
3809
3810void lockdep_rcu_dereference(const char *file, const int line)
3811{
3812 struct task_struct *curr = current;
3813
3814#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3815 if (!debug_locks_off())
3816 return;
3817#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
3818 /* Note: the following can be executed concurrently, so be careful. */
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
3826 lockdep_print_held_locks(curr);
3827 printk("\nstack backtrace:\n");
3828 dump_stack();
3829}
3830EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
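
lockdep_rcu_dereference(), added above, is the splat printed when rcu_dereference_check() is used without the protection its condition claims. Unless CONFIG_PROVE_RCU_REPEATEDLY is set it reports only once, because debug_locks_off() returns nonzero only for the caller that actually turned debugging off, and concurrent offenders are tolerated rather than locked out. A compact sketch of that report-once idiom using C11 atomics (not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int debug_on = 1;

/* Returns 1 only for the caller that actually flipped the flag,
 * so concurrent offenders produce a single report. */
static int debug_off_once(void)
{
    return atomic_exchange(&debug_on, 0);
}

static void report_suspicious_use(const char *file, int line)
{
    if (!debug_off_once())
        return;                       /* someone already reported */
    fprintf(stderr, "%s:%d: unprotected dereference\n", file, line);
}

int main(void)
{
    report_suspicious_use("example.c", 42);   /* prints */
    report_suspicious_use("example.c", 43);   /* silent */
    return 0;
}
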
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
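
The lockdep_internals.h hunk replaces the global atomic_t counters with one struct lockdep_stats per CPU: the fast paths increment only their own CPU's copy, and debug_atomic_read() folds all copies together when the statistics are read, which is why lockdep_proc.c below switches the sums to unsigned long long. A userspace sketch of the same increment-locally, sum-on-read pattern, using per-thread slots in place of per-CPU data (assumes only C99 and pthreads):

#include <pthread.h>
#include <stdio.h>

#define NR_SLOTS 4                      /* stand-in for NR_CPUS */

struct stats { unsigned long events; };
static struct stats per_slot[NR_SLOTS];

static void *worker(void *arg)
{
    struct stats *mine = &per_slot[(long)arg];
    for (int i = 0; i < 1000000; i++)
        mine->events++;                 /* private slot: no contention */
    return NULL;
}

static unsigned long long read_events(void)
{
    unsigned long long total = 0;       /* folded only when reporting */
    for (int s = 0; s < NR_SLOTS; s++)
        total += per_slot[s].events;
    return total;
}

int main(void)
{
    pthread_t t[NR_SLOTS];
    for (long s = 0; s < NR_SLOTS; s++)
        pthread_create(&t[s], NULL, worker, (void *)s);
    for (int s = 0; s < NR_SLOTS; s++)
        pthread_join(t[s], NULL);
    printf("events: %llu\n", read_events());
    return 0;
}

The trade-off is the same as in the patch: updates become cheap and bounce no cache lines, while reads become a little more expensive and only approximately consistent, which is acceptable for statistics.
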
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index 5daf0abd63c1..6c562828c85c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -74,11 +72,19 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
74/* If this is set, the section belongs in the init part of the module */ 72/* If this is set, the section belongs in the init part of the module */
75#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
76 74
77/* List of modules, protected by module_mutex or preempt_disable 75/*
76 * Mutex protects:
77 * 1) List of modules (also safely readable with preempt_disable),
78 * 2) module_use links,
79 * 3) module_addr_min/module_addr_max.
78 * (delete uses stop_machine/add uses RCU list operations). */ 80 * (delete uses stop_machine/add uses RCU list operations). */
79DEFINE_MUTEX(module_mutex); 81DEFINE_MUTEX(module_mutex);
80EXPORT_SYMBOL_GPL(module_mutex); 82EXPORT_SYMBOL_GPL(module_mutex);
81static LIST_HEAD(modules); 83static LIST_HEAD(modules);
84#ifdef CONFIG_KGDB_KDB
85struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
86#endif /* CONFIG_KGDB_KDB */
87
82 88
83/* Block module loading/unloading? */ 89/* Block module loading/unloading? */
84int modules_disabled = 0; 90int modules_disabled = 0;
@@ -88,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
88 94
89static BLOCKING_NOTIFIER_HEAD(module_notify_list); 95static BLOCKING_NOTIFIER_HEAD(module_notify_list);
90 96
91/* Bounds of module allocation, for speeding __module_address */ 97/* Bounds of module allocation, for speeding __module_address.
98 * Protected by module_mutex. */
92static unsigned long module_addr_min = -1UL, module_addr_max = 0; 99static unsigned long module_addr_min = -1UL, module_addr_max = 0;
93 100
94int register_module_notifier(struct notifier_block * nb) 101int register_module_notifier(struct notifier_block * nb)
@@ -178,8 +185,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
178extern const struct kernel_symbol __stop___ksymtab_gpl[]; 185extern const struct kernel_symbol __stop___ksymtab_gpl[];
179extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 186extern const struct kernel_symbol __start___ksymtab_gpl_future[];
180extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 187extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
183extern const unsigned long __start___kcrctab[]; 188extern const unsigned long __start___kcrctab[];
184extern const unsigned long __start___kcrctab_gpl[]; 189extern const unsigned long __start___kcrctab_gpl[];
185extern const unsigned long __start___kcrctab_gpl_future[]; 190extern const unsigned long __start___kcrctab_gpl_future[];
@@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
329} 334}
330 335
331/* Find a symbol and return it, along with, (optional) crc and 336/* Find a symbol and return it, along with, (optional) crc and
332 * (optional) module which owns it */ 337 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
333const struct kernel_symbol *find_symbol(const char *name, 338const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 339 struct module **owner,
335 const unsigned long **crc, 340 const unsigned long **crc,
@@ -370,27 +375,33 @@ EXPORT_SYMBOL_GPL(find_module);
370 375
371#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
372 377
373static void *percpu_modalloc(unsigned long size, unsigned long align, 378static inline void __percpu *mod_percpu(struct module *mod)
374 const char *name)
375{ 379{
376 void *ptr; 380 return mod->percpu;
381}
377 382
383static int percpu_modalloc(struct module *mod,
384 unsigned long size, unsigned long align)
385{
378 if (align > PAGE_SIZE) { 386 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 387 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE); 388 mod->name, align, PAGE_SIZE);
381 align = PAGE_SIZE; 389 align = PAGE_SIZE;
382 } 390 }
383 391
384 ptr = __alloc_reserved_percpu(size, align); 392 mod->percpu = __alloc_reserved_percpu(size, align);
385 if (!ptr) 393 if (!mod->percpu) {
386 printk(KERN_WARNING 394 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size); 395 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr; 396 return -ENOMEM;
397 }
398 mod->percpu_size = size;
399 return 0;
389} 400}
390 401
391static void percpu_modfree(void *freeme) 402static void percpu_modfree(struct module *mod)
392{ 403{
393 free_percpu(freeme); 404 free_percpu(mod->percpu);
394} 405}
395 406
396static unsigned int find_pcpusec(Elf_Ehdr *hdr, 407static unsigned int find_pcpusec(Elf_Ehdr *hdr,
@@ -400,24 +411,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
400 return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); 411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
401} 412}
402 413
403static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 414static void percpu_modcopy(struct module *mod,
415 const void *from, unsigned long size)
404{ 416{
405 int cpu; 417 int cpu;
406 418
407 for_each_possible_cpu(cpu) 419 for_each_possible_cpu(cpu)
408 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 420 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
421}
422
423/**
424 * is_module_percpu_address - test whether address is from module static percpu
425 * @addr: address to test
426 *
427 * Test whether @addr belongs to module static percpu area.
428 *
429 * RETURNS:
430 * %true if @addr is from module static percpu area
431 */
432bool is_module_percpu_address(unsigned long addr)
433{
434 struct module *mod;
435 unsigned int cpu;
436
437 preempt_disable();
438
439 list_for_each_entry_rcu(mod, &modules, list) {
440 if (!mod->percpu_size)
441 continue;
442 for_each_possible_cpu(cpu) {
443 void *start = per_cpu_ptr(mod->percpu, cpu);
444
445 if ((void *)addr >= start &&
446 (void *)addr < start + mod->percpu_size) {
447 preempt_enable();
448 return true;
449 }
450 }
451 }
452
453 preempt_enable();
454 return false;
409} 455}
410 456
411#else /* ... !CONFIG_SMP */ 457#else /* ... !CONFIG_SMP */
412 458
413static inline void *percpu_modalloc(unsigned long size, unsigned long align, 459static inline void __percpu *mod_percpu(struct module *mod)
414 const char *name)
415{ 460{
416 return NULL; 461 return NULL;
417} 462}
418static inline void percpu_modfree(void *pcpuptr) 463static inline int percpu_modalloc(struct module *mod,
464 unsigned long size, unsigned long align)
465{
466 return -ENOMEM;
467}
468static inline void percpu_modfree(struct module *mod)
419{ 469{
420 BUG();
421} 470}
422static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
423 Elf_Shdr *sechdrs, 472 Elf_Shdr *sechdrs,
@@ -425,12 +474,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
425{ 474{
426 return 0; 475 return 0;
427} 476}
428static inline void percpu_modcopy(void *pcpudst, const void *src, 477static inline void percpu_modcopy(struct module *mod,
429 unsigned long size) 478 const void *from, unsigned long size)
430{ 479{
431 /* pcpusec should be 0, and size of that section should be 0. */ 480 /* pcpusec should be 0, and size of that section should be 0. */
432 BUG_ON(size != 0); 481 BUG_ON(size != 0);
433} 482}
483bool is_module_percpu_address(unsigned long addr)
484{
485 return false;
486}
434 487
435#endif /* CONFIG_SMP */ 488#endif /* CONFIG_SMP */
436 489
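
is_module_percpu_address(), added in the hunks above, answers a single question: does the given address fall inside any module's static per-cpu chunk on any CPU, i.e. inside one of the [start, start + percpu_size) ranges. Reduced to that core, the test looks like this (illustrative sketch, not the kernel code):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NR_SLOTS 2

struct fake_mod {
    void  *chunk[NR_SLOTS];   /* one per-cpu copy per "CPU" */
    size_t chunk_size;        /* 0 means: no per-cpu data */
};

static bool addr_in_percpu(const struct fake_mod *mod, const void *addr)
{
    if (!mod->chunk_size)
        return false;
    for (int s = 0; s < NR_SLOTS; s++) {
        const char *start = mod->chunk[s];
        if ((const char *)addr >= start &&
            (const char *)addr <  start + mod->chunk_size)
            return true;
    }
    return false;
}

int main(void)
{
    static char cpu0[64], cpu1[64];
    struct fake_mod m = { { cpu0, cpu1 }, sizeof(cpu0) };

    printf("%d %d\n", addr_in_percpu(&m, cpu1 + 8),    /* 1 */
                      addr_in_percpu(&m, (char *)&m)); /* 0 */
    return 0;
}
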
@@ -467,34 +520,34 @@ MODINFO_ATTR(srcversion);
467static char last_unloaded_module[MODULE_NAME_LEN+1]; 520static char last_unloaded_module[MODULE_NAME_LEN+1];
468 521
469#ifdef CONFIG_MODULE_UNLOAD 522#ifdef CONFIG_MODULE_UNLOAD
523
524EXPORT_TRACEPOINT_SYMBOL(module_get);
525
470/* Init the unload section of the module. */ 526/* Init the unload section of the module. */
471static void module_unload_init(struct module *mod) 527static void module_unload_init(struct module *mod)
472{ 528{
473 int cpu; 529 int cpu;
474 530
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 531 INIT_LIST_HEAD(&mod->source_list);
476 for_each_possible_cpu(cpu) 532 INIT_LIST_HEAD(&mod->target_list);
477 local_set(__module_ref_addr(mod, cpu), 0); 533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
537
478 /* Hold reference count during initialization. */ 538 /* Hold reference count during initialization. */
479 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); 539 __this_cpu_write(mod->refptr->incs, 1);
480 /* Backwards compatibility macros put refcount during init. */ 540 /* Backwards compatibility macros put refcount during init. */
481 mod->waiter = current; 541 mod->waiter = current;
482} 542}
483 543
484/* modules using other modules */
485struct module_use
486{
487 struct list_head list;
488 struct module *module_which_uses;
489};
490
491/* Does a already use b? */ 544/* Does a already use b? */
492static int already_uses(struct module *a, struct module *b) 545static int already_uses(struct module *a, struct module *b)
493{ 546{
494 struct module_use *use; 547 struct module_use *use;
495 548
496 list_for_each_entry(use, &b->modules_which_use_me, list) { 549 list_for_each_entry(use, &b->source_list, source_list) {
497 if (use->module_which_uses == a) { 550 if (use->source == a) {
498 DEBUGP("%s uses %s!\n", a->name, b->name); 551 DEBUGP("%s uses %s!\n", a->name, b->name);
499 return 1; 552 return 1;
500 } 553 }
@@ -503,62 +556,68 @@ static int already_uses(struct module *a, struct module *b)
503 return 0; 556 return 0;
504} 557}
505 558
506/* Module a uses b */ 559/*
507int use_module(struct module *a, struct module *b) 560 * Module a uses b
561 * - we add 'a' as a "source", 'b' as a "target" of module use
562 * - the module_use is added to the list of 'b' sources (so
563 * 'b' can walk the list to see who sourced them), and of 'a'
564 * targets (so 'a' can see what modules it targets).
565 */
566static int add_module_usage(struct module *a, struct module *b)
508{ 567{
509 struct module_use *use; 568 struct module_use *use;
510 int no_warn, err;
511 569
512 if (b == NULL || already_uses(a, b)) return 1; 570 DEBUGP("Allocating new usage for %s.\n", a->name);
571 use = kmalloc(sizeof(*use), GFP_ATOMIC);
572 if (!use) {
573 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
574 return -ENOMEM;
575 }
513 576
514 /* If we're interrupted or time out, we fail. */ 577 use->source = a;
515 if (wait_event_interruptible_timeout( 578 use->target = b;
516 module_wq, (err = strong_try_module_get(b)) != -EBUSY, 579 list_add(&use->source_list, &b->source_list);
517 30 * HZ) <= 0) { 580 list_add(&use->target_list, &a->target_list);
518 printk("%s: gave up waiting for init of module %s.\n", 581 return 0;
519 a->name, b->name); 582}
583
584/* Module a uses b: caller needs module_mutex() */
585int ref_module(struct module *a, struct module *b)
586{
587 int err;
588
589 if (b == NULL || already_uses(a, b))
520 return 0; 590 return 0;
521 }
522 591
523 /* If strong_try_module_get() returned a different error, we fail. */ 592 /* If module isn't available, we fail. */
593 err = strong_try_module_get(b);
524 if (err) 594 if (err)
525 return 0; 595 return err;
526 596
527 DEBUGP("Allocating new usage for %s.\n", a->name); 597 err = add_module_usage(a, b);
528 use = kmalloc(sizeof(*use), GFP_ATOMIC); 598 if (err) {
529 if (!use) {
530 printk("%s: out of memory loading\n", a->name);
531 module_put(b); 599 module_put(b);
532 return 0; 600 return err;
533 } 601 }
534 602 return 0;
535 use->module_which_uses = a;
536 list_add(&use->list, &b->modules_which_use_me);
537 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
538 return 1;
539} 603}
540EXPORT_SYMBOL_GPL(use_module); 604EXPORT_SYMBOL_GPL(ref_module);
541 605
542/* Clear the unload stuff of the module. */ 606/* Clear the unload stuff of the module. */
543static void module_unload_free(struct module *mod) 607static void module_unload_free(struct module *mod)
544{ 608{
545 struct module *i; 609 struct module_use *use, *tmp;
546
547 list_for_each_entry(i, &modules, list) {
548 struct module_use *use;
549 610
550 list_for_each_entry(use, &i->modules_which_use_me, list) { 611 mutex_lock(&module_mutex);
551 if (use->module_which_uses == mod) { 612 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
552 DEBUGP("%s unusing %s\n", mod->name, i->name); 613 struct module *i = use->target;
553 module_put(i); 614 DEBUGP("%s unusing %s\n", mod->name, i->name);
554 list_del(&use->list); 615 module_put(i);
555 kfree(use); 616 list_del(&use->source_list);
556 sysfs_remove_link(i->holders_dir, mod->name); 617 list_del(&use->target_list);
557 /* There can be at most one match. */ 618 kfree(use);
558 break;
559 }
560 }
561 } 619 }
620 mutex_unlock(&module_mutex);
562} 621}
563 622
564#ifdef CONFIG_MODULE_FORCE_UNLOAD 623#ifdef CONFIG_MODULE_FORCE_UNLOAD
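
The unload bookkeeping above threads each struct module_use edge onto two lists at once: the used module's source_list ("who uses me") and the using module's target_list ("whom do I use"). module_unload_free() then only walks the dying module's own target_list instead of scanning every module in the system. A stripped-down sketch of such a double-threaded edge, using plain pointers instead of list_head (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct mod;

struct use_edge {                 /* "a uses b": one edge, two lists */
    struct mod *source, *target;
    struct use_edge *next_source; /* link in target->sources */
    struct use_edge *next_target; /* link in source->targets */
};

struct mod {
    const char *name;
    struct use_edge *sources;     /* who uses me */
    struct use_edge *targets;     /* whom I use  */
};

static int add_usage(struct mod *a, struct mod *b)
{
    struct use_edge *use = malloc(sizeof(*use));
    if (!use)
        return -1;
    use->source = a;
    use->target = b;
    use->next_source = b->sources;  b->sources = use;  /* b: "a uses me" */
    use->next_target = a->targets;  a->targets = use;  /* a: "I use b"  */
    return 0;
}

int main(void)
{
    struct mod a = { "a", NULL, NULL }, b = { "b", NULL, NULL };
    add_usage(&a, &b);
    for (struct use_edge *u = a.targets; u; u = u->next_target)
        printf("%s uses %s\n", u->source->name, u->target->name);
    for (struct use_edge *u = b.sources; u; u = u->next_source)
        printf("%s is used by %s\n", u->target->name, u->source->name);
    return 0;
}
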
@@ -615,12 +674,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
615 674
616unsigned int module_refcount(struct module *mod) 675unsigned int module_refcount(struct module *mod)
617{ 676{
618 unsigned int total = 0; 677 unsigned int incs = 0, decs = 0;
619 int cpu; 678 int cpu;
620 679
621 for_each_possible_cpu(cpu) 680 for_each_possible_cpu(cpu)
622 total += local_read(__module_ref_addr(mod, cpu)); 681 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
623 return total; 682 /*
683 * ensure the incs are added up after the decs.
684 * module_put ensures incs are visible before decs with smp_wmb.
685 *
686 * This 2-count scheme avoids the situation where the refcount
687 * for CPU0 is read, then CPU0 increments the module refcount,
688 * then CPU1 drops that refcount, then the refcount for CPU1 is
689 * read. We would record a decrement but not its corresponding
690 * increment so we would see a low count (disaster).
691 *
692 * Rare situation? But module_refcount can be preempted, and we
693 * might be tallying up 4096+ CPUs. So it is not impossible.
694 */
695 smp_rmb();
696 for_each_possible_cpu(cpu)
697 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
698 return incs - decs;
624} 699}
625EXPORT_SYMBOL(module_refcount); 700EXPORT_SYMBOL(module_refcount);
626 701
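
The new module_refcount() keeps separate incs and decs counters per CPU and relies on ordering rather than locking: module_put() publishes its increments before bumping decs (the smp_wmb further down), and the reader sums all decs first, issues smp_rmb, then sums all incs, so it never counts a decrement whose increment it has not counted as well. A much-simplified sketch of that read ordering with C11 atomics, collapsing the per-CPU pairs into a single pair; the kernel's full argument also depends on how references are handed between CPUs:

#include <stdatomic.h>
#include <stdio.h>

/* One pair shown for brevity; the kernel keeps a pair per CPU. */
static atomic_ulong incs, decs;

static void ref_get(void)
{
    atomic_fetch_add_explicit(&incs, 1, memory_order_relaxed);
}

static void ref_put(void)
{
    /* release: the matching inc is visible before this dec is */
    atomic_fetch_add_explicit(&decs, 1, memory_order_release);
}

static unsigned long ref_count(void)
{
    /* read decs first, then incs, so we never count a dec
     * whose inc we have not counted as well */
    unsigned long d = atomic_load_explicit(&decs, memory_order_acquire);
    unsigned long i = atomic_load_explicit(&incs, memory_order_relaxed);
    return i - d;
}

int main(void)
{
    ref_get();
    ref_get();
    ref_put();
    printf("count = %lu\n", ref_count());   /* 1 */
    return 0;
}
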
@@ -656,16 +731,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
656 return -EFAULT; 731 return -EFAULT;
657 name[MODULE_NAME_LEN-1] = '\0'; 732 name[MODULE_NAME_LEN-1] = '\0';
658 733
659 /* Create stop_machine threads since free_module relies on 734 if (mutex_lock_interruptible(&module_mutex) != 0)
660 * a non-failing stop_machine call. */ 735 return -EINTR;
661 ret = stop_machine_create();
662 if (ret)
663 return ret;
664
665 if (mutex_lock_interruptible(&module_mutex) != 0) {
666 ret = -EINTR;
667 goto out_stop;
668 }
669 736
670 mod = find_module(name); 737 mod = find_module(name);
671 if (!mod) { 738 if (!mod) {
@@ -673,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
673 goto out; 740 goto out;
674 } 741 }
675 742
676 if (!list_empty(&mod->modules_which_use_me)) { 743 if (!list_empty(&mod->source_list)) {
677 /* Other modules depend on us: get rid of them first. */ 744 /* Other modules depend on us: get rid of them first. */
678 ret = -EWOULDBLOCK; 745 ret = -EWOULDBLOCK;
679 goto out; 746 goto out;
@@ -717,16 +784,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
717 blocking_notifier_call_chain(&module_notify_list, 784 blocking_notifier_call_chain(&module_notify_list,
718 MODULE_STATE_GOING, mod); 785 MODULE_STATE_GOING, mod);
719 async_synchronize_full(); 786 async_synchronize_full();
720 mutex_lock(&module_mutex); 787
721 /* Store the name of the last unloaded module for diagnostic purposes */ 788 /* Store the name of the last unloaded module for diagnostic purposes */
722 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 789 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
723 ddebug_remove_module(mod->name);
724 free_module(mod);
725 790
726 out: 791 free_module(mod);
792 return 0;
793out:
727 mutex_unlock(&module_mutex); 794 mutex_unlock(&module_mutex);
728out_stop:
729 stop_machine_destroy();
730 return ret; 795 return ret;
731} 796}
732 797
@@ -739,9 +804,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
739 804
740 /* Always include a trailing , so userspace can differentiate 805 /* Always include a trailing , so userspace can differentiate
741 between this and the old multi-field proc format. */ 806 between this and the old multi-field proc format. */
742 list_for_each_entry(use, &mod->modules_which_use_me, list) { 807 list_for_each_entry(use, &mod->source_list, source_list) {
743 printed_something = 1; 808 printed_something = 1;
744 seq_printf(m, "%s,", use->module_which_uses->name); 809 seq_printf(m, "%s,", use->source->name);
745 } 810 }
746 811
747 if (mod->init != NULL && mod->exit == NULL) { 812 if (mod->init != NULL && mod->exit == NULL) {
@@ -796,14 +861,15 @@ static struct module_attribute refcnt = {
796void module_put(struct module *module) 861void module_put(struct module *module)
797{ 862{
798 if (module) { 863 if (module) {
799 unsigned int cpu = get_cpu(); 864 preempt_disable();
800 local_dec(__module_ref_addr(module, cpu)); 865 smp_wmb(); /* see comment in module_refcount */
801 trace_module_put(module, _RET_IP_, 866 __this_cpu_inc(module->refptr->decs);
802 local_read(__module_ref_addr(module, cpu))); 867
868 trace_module_put(module, _RET_IP_);
803 /* Maybe they're waiting for us to drop reference? */ 869 /* Maybe they're waiting for us to drop reference? */
804 if (unlikely(!module_is_live(module))) 870 if (unlikely(!module_is_live(module)))
805 wake_up_process(module->waiter); 871 wake_up_process(module->waiter);
806 put_cpu(); 872 preempt_enable();
807 } 873 }
808} 874}
809EXPORT_SYMBOL(module_put); 875EXPORT_SYMBOL(module_put);
@@ -819,11 +885,11 @@ static inline void module_unload_free(struct module *mod)
819{ 885{
820} 886}
821 887
822int use_module(struct module *a, struct module *b) 888int ref_module(struct module *a, struct module *b)
823{ 889{
824 return strong_try_module_get(b) == 0; 890 return strong_try_module_get(b);
825} 891}
826EXPORT_SYMBOL_GPL(use_module); 892EXPORT_SYMBOL_GPL(ref_module);
827 893
828static inline void module_unload_init(struct module *mod) 894static inline void module_unload_init(struct module *mod)
829{ 895{
@@ -940,6 +1006,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
940{ 1006{
941 const unsigned long *crc; 1007 const unsigned long *crc;
942 1008
1009 /* Since this should be found in kernel (which can't be removed),
1010 * no locking is necessary. */
943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1011 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
944 &crc, true, false)) 1012 &crc, true, false))
945 BUG(); 1013 BUG();
@@ -982,29 +1050,62 @@ static inline int same_magic(const char *amagic, const char *bmagic,
982} 1050}
983#endif /* CONFIG_MODVERSIONS */ 1051#endif /* CONFIG_MODVERSIONS */
984 1052
985/* Resolve a symbol for this module. I.e. if we find one, record usage. 1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */
986 Must be holding module_mutex. */
987static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
988 unsigned int versindex, 1055 unsigned int versindex,
989 const char *name, 1056 const char *name,
990 struct module *mod) 1057 struct module *mod,
1058 char ownername[])
991{ 1059{
992 struct module *owner; 1060 struct module *owner;
993 const struct kernel_symbol *sym; 1061 const struct kernel_symbol *sym;
994 const unsigned long *crc; 1062 const unsigned long *crc;
1063 int err;
995 1064
1065 mutex_lock(&module_mutex);
996 sym = find_symbol(name, &owner, &crc, 1066 sym = find_symbol(name, &owner, &crc,
997 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1067 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
998 /* use_module can fail due to OOM, 1068 if (!sym)
999 or module initialization or unloading */ 1069 goto unlock;
1000 if (sym) { 1070
1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner) 1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
1002 || !use_module(mod, owner)) 1072 sym = ERR_PTR(-EINVAL);
1003 sym = NULL; 1073 goto getname;
1074 }
1075
1076 err = ref_module(mod, owner);
1077 if (err) {
1078 sym = ERR_PTR(err);
1079 goto getname;
1004 } 1080 }
1081
1082getname:
1083 /* We must make copy under the lock if we failed to get ref. */
1084 strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
1085unlock:
1086 mutex_unlock(&module_mutex);
1005 return sym; 1087 return sym;
1006} 1088}
1007 1089
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1091 unsigned int versindex,
1092 const char *name,
1093 struct module *mod)
1094{
1095 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN];
1097
1098 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
1100 mod, ownername)) ||
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername);
1105 }
1106 return ksym;
1107}
1108
1008/* 1109/*
1009 * /sys/module/foo/sections stuff 1110 * /sys/module/foo/sections stuff
1010 * J. Corbet <corbet@lwn.net> 1111 * J. Corbet <corbet@lwn.net>
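
resolve_symbol() now runs its lookup under module_mutex, reports failures as ERR_PTR() values instead of NULL, and copies the owner's name out while still holding the mutex, since the owning module may disappear as soon as the lock is dropped; resolve_symbol_wait() then retries for up to 30 seconds while the answer is -EBUSY, i.e. while that owner is still initializing. A userspace sketch of the copy-under-the-lock, retry-on-EBUSY-with-a-deadline shape (pthreads; every name here is illustrative):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int owner_ready;                       /* toggled elsewhere */
static const char *owner_name = "provider";

/* Returns 0, or -EBUSY while the owner is still initializing. */
static int lookup(char *ownername, size_t len)
{
    int err;

    pthread_mutex_lock(&table_lock);
    err = owner_ready ? 0 : -EBUSY;
    /* copy what we need while the owner cannot go away */
    strncpy(ownername, owner_name, len - 1);
    ownername[len - 1] = '\0';
    pthread_mutex_unlock(&table_lock);
    return err;
}

static int lookup_wait(char *ownername, size_t len, int timeout_sec)
{
    time_t deadline = time(NULL) + timeout_sec;
    int err;

    while ((err = lookup(ownername, len)) == -EBUSY) {
        if (time(NULL) >= deadline) {
            fprintf(stderr, "gave up waiting for init of %s\n", ownername);
            break;
        }
        struct timespec ts = { 0, 10 * 1000 * 1000 };   /* 10 ms */
        nanosleep(&ts, NULL);
    }
    return err;
}

int main(void)
{
    char name[32];
    owner_ready = 1;                  /* pretend init already finished */
    printf("lookup: %d (%s)\n", lookup_wait(name, sizeof(name), 30), name);
    return 0;
}
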
@@ -1083,6 +1184,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1083 if (sattr->name == NULL) 1184 if (sattr->name == NULL)
1084 goto out; 1185 goto out;
1085 sect_attrs->nsections++; 1186 sect_attrs->nsections++;
1187 sysfs_attr_init(&sattr->mattr.attr);
1086 sattr->mattr.show = module_sect_show; 1188 sattr->mattr.show = module_sect_show;
1087 sattr->mattr.store = NULL; 1189 sattr->mattr.store = NULL;
1088 sattr->mattr.attr.name = sattr->name; 1190 sattr->mattr.attr.name = sattr->name;
@@ -1122,7 +1224,7 @@ struct module_notes_attrs {
1122 struct bin_attribute attrs[0]; 1224 struct bin_attribute attrs[0];
1123}; 1225};
1124 1226
1125static ssize_t module_notes_read(struct kobject *kobj, 1227static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1126 struct bin_attribute *bin_attr, 1228 struct bin_attribute *bin_attr,
1127 char *buf, loff_t pos, size_t count) 1229 char *buf, loff_t pos, size_t count)
1128{ 1230{
@@ -1178,6 +1280,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1178 if (sect_empty(&sechdrs[i])) 1280 if (sect_empty(&sechdrs[i]))
1179 continue; 1281 continue;
1180 if (sechdrs[i].sh_type == SHT_NOTE) { 1282 if (sechdrs[i].sh_type == SHT_NOTE) {
1283 sysfs_bin_attr_init(nattr);
1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1284 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
1182 nattr->attr.mode = S_IRUGO; 1285 nattr->attr.mode = S_IRUGO;
1183 nattr->size = sechdrs[i].sh_size; 1286 nattr->size = sechdrs[i].sh_size;
@@ -1232,7 +1335,34 @@ static inline void remove_notes_attrs(struct module *mod)
1232#endif 1335#endif
1233 1336
1234#ifdef CONFIG_SYSFS 1337#ifdef CONFIG_SYSFS
1235int module_add_modinfo_attrs(struct module *mod) 1338static void add_usage_links(struct module *mod)
1339{
1340#ifdef CONFIG_MODULE_UNLOAD
1341 struct module_use *use;
1342 int nowarn;
1343
1344 mutex_lock(&module_mutex);
1345 list_for_each_entry(use, &mod->target_list, target_list) {
1346 nowarn = sysfs_create_link(use->target->holders_dir,
1347 &mod->mkobj.kobj, mod->name);
1348 }
1349 mutex_unlock(&module_mutex);
1350#endif
1351}
1352
1353static void del_usage_links(struct module *mod)
1354{
1355#ifdef CONFIG_MODULE_UNLOAD
1356 struct module_use *use;
1357
1358 mutex_lock(&module_mutex);
1359 list_for_each_entry(use, &mod->target_list, target_list)
1360 sysfs_remove_link(use->target->holders_dir, mod->name);
1361 mutex_unlock(&module_mutex);
1362#endif
1363}
1364
1365static int module_add_modinfo_attrs(struct module *mod)
1236{ 1366{
1237 struct module_attribute *attr; 1367 struct module_attribute *attr;
1238 struct module_attribute *temp_attr; 1368 struct module_attribute *temp_attr;
@@ -1250,6 +1380,7 @@ int module_add_modinfo_attrs(struct module *mod)
1250 if (!attr->test || 1380 if (!attr->test ||
1251 (attr->test && attr->test(mod))) { 1381 (attr->test && attr->test(mod))) {
1252 memcpy(temp_attr, attr, sizeof(*temp_attr)); 1382 memcpy(temp_attr, attr, sizeof(*temp_attr));
1383 sysfs_attr_init(&temp_attr->attr);
1253 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); 1384 error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
1254 ++temp_attr; 1385 ++temp_attr;
1255 } 1386 }
@@ -1257,7 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod)
1257 return error; 1388 return error;
1258} 1389}
1259 1390
1260void module_remove_modinfo_attrs(struct module *mod) 1391static void module_remove_modinfo_attrs(struct module *mod)
1261{ 1392{
1262 struct module_attribute *attr; 1393 struct module_attribute *attr;
1263 int i; 1394 int i;
@@ -1273,7 +1404,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1273 kfree(mod->modinfo_attrs); 1404 kfree(mod->modinfo_attrs);
1274} 1405}
1275 1406
1276int mod_sysfs_init(struct module *mod) 1407static int mod_sysfs_init(struct module *mod)
1277{ 1408{
1278 int err; 1409 int err;
1279 struct kobject *kobj; 1410 struct kobject *kobj;
@@ -1307,12 +1438,16 @@ out:
1307 return err; 1438 return err;
1308} 1439}
1309 1440
1310int mod_sysfs_setup(struct module *mod, 1441static int mod_sysfs_setup(struct module *mod,
1311 struct kernel_param *kparam, 1442 struct kernel_param *kparam,
1312 unsigned int num_params) 1443 unsigned int num_params)
1313{ 1444{
1314 int err; 1445 int err;
1315 1446
1447 err = mod_sysfs_init(mod);
1448 if (err)
1449 goto out;
1450
1316 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); 1451 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1317 if (!mod->holders_dir) { 1452 if (!mod->holders_dir) {
1318 err = -ENOMEM; 1453 err = -ENOMEM;
@@ -1327,6 +1462,8 @@ int mod_sysfs_setup(struct module *mod,
1327 if (err) 1462 if (err)
1328 goto out_unreg_param; 1463 goto out_unreg_param;
1329 1464
1465 add_usage_links(mod);
1466
1330 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1331 return 0; 1468 return 0;
1332 1469
@@ -1336,6 +1473,7 @@ out_unreg_holders:
1336 kobject_put(mod->holders_dir); 1473 kobject_put(mod->holders_dir);
1337out_unreg: 1474out_unreg:
1338 kobject_put(&mod->mkobj.kobj); 1475 kobject_put(&mod->mkobj.kobj);
1476out:
1339 return err; 1477 return err;
1340} 1478}
1341 1479
@@ -1346,14 +1484,40 @@ static void mod_sysfs_fini(struct module *mod)
1346 1484
1347#else /* CONFIG_SYSFS */ 1485#else /* CONFIG_SYSFS */
1348 1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491
1492static inline int mod_sysfs_setup(struct module *mod,
1493 struct kernel_param *kparam,
1494 unsigned int num_params)
1495{
1496 return 0;
1497}
1498
1499static inline int module_add_modinfo_attrs(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{
1506}
1507
1349static void mod_sysfs_fini(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1350{ 1509{
1351} 1510}
1352 1511
1512static void del_usage_links(struct module *mod)
1513{
1514}
1515
1353#endif /* CONFIG_SYSFS */ 1516#endif /* CONFIG_SYSFS */
1354 1517
1355static void mod_kobject_remove(struct module *mod) 1518static void mod_kobject_remove(struct module *mod)
1356{ 1519{
1520 del_usage_links(mod);
1357 module_remove_modinfo_attrs(mod); 1521 module_remove_modinfo_attrs(mod);
1358 module_param_sysfs_remove(mod); 1522 module_param_sysfs_remove(mod);
1359 kobject_put(mod->mkobj.drivers_dir); 1523 kobject_put(mod->mkobj.drivers_dir);
@@ -1372,17 +1536,22 @@ static int __unlink_module(void *_mod)
1372 return 0; 1536 return 0;
1373} 1537}
1374 1538
1375/* Free a module, remove from lists, etc (must hold module_mutex). */ 1539/* Free a module, remove from lists, etc. */
1376static void free_module(struct module *mod) 1540static void free_module(struct module *mod)
1377{ 1541{
1378 trace_module_free(mod); 1542 trace_module_free(mod);
1379 1543
1380 /* Delete from various lists */ 1544 /* Delete from various lists */
1545 mutex_lock(&module_mutex);
1381 stop_machine(__unlink_module, mod, NULL); 1546 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex);
1382 remove_notes_attrs(mod); 1548 remove_notes_attrs(mod);
1383 remove_sect_attrs(mod); 1549 remove_sect_attrs(mod);
1384 mod_kobject_remove(mod); 1550 mod_kobject_remove(mod);
1385 1551
1552 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name);
1554
1386 /* Arch-specific cleanup. */ 1555 /* Arch-specific cleanup. */
1387 module_arch_cleanup(mod); 1556 module_arch_cleanup(mod);
1388 1557
@@ -1395,11 +1564,10 @@ static void free_module(struct module *mod)
1395 /* This may be NULL, but that's OK */ 1564 /* This may be NULL, but that's OK */
1396 module_free(mod, mod->module_init); 1565 module_free(mod, mod->module_init);
1397 kfree(mod->args); 1566 kfree(mod->args);
1398 if (mod->percpu) 1567 percpu_modfree(mod);
1399 percpu_modfree(mod->percpu); 1568#if defined(CONFIG_MODULE_UNLOAD)
1400#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1401 if (mod->refptr) 1569 if (mod->refptr)
1402 percpu_modfree(mod->refptr); 1570 free_percpu(mod->refptr);
1403#endif 1571#endif
1404 /* Free lock-classes: */ 1572 /* Free lock-classes: */
1405 lockdep_free_key_range(mod->module_core, mod->core_size); 1573 lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1430,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1430/* 1598/*
1431 * Ensure that an exported symbol [global namespace] does not already exist 1599 * Ensure that an exported symbol [global namespace] does not already exist
1432 * in the kernel or in some other module's exported symbol table. 1600 * in the kernel or in some other module's exported symbol table.
1601 *
1602 * You must hold the module_mutex.
1433 */ 1603 */
1434static int verify_export_symbols(struct module *mod) 1604static int verify_export_symbols(struct module *mod)
1435{ 1605{
@@ -1495,27 +1665,29 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1495 break; 1665 break;
1496 1666
1497 case SHN_UNDEF: 1667 case SHN_UNDEF:
1498 ksym = resolve_symbol(sechdrs, versindex, 1668 ksym = resolve_symbol_wait(sechdrs, versindex,
1499 strtab + sym[i].st_name, mod); 1669 strtab + sym[i].st_name,
1670 mod);
1500 /* Ok if resolved. */ 1671 /* Ok if resolved. */
1501 if (ksym) { 1672 if (ksym && !IS_ERR(ksym)) {
1502 sym[i].st_value = ksym->value; 1673 sym[i].st_value = ksym->value;
1503 break; 1674 break;
1504 } 1675 }
1505 1676
1506 /* Ok if weak. */ 1677 /* Ok if weak. */
1507 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1678 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1508 break; 1679 break;
1509 1680
1510 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1511 mod->name, strtab + sym[i].st_name); 1682 mod->name, strtab + sym[i].st_name,
1512 ret = -ENOENT; 1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT;
1513 break; 1685 break;
1514 1686
1515 default: 1687 default:
1516 /* Divert to percpu allocation if a percpu var. */ 1688 /* Divert to percpu allocation if a percpu var. */
1517 if (sym[i].st_shndx == pcpuindex) 1689 if (sym[i].st_shndx == pcpuindex)
1518 secbase = (unsigned long)mod->percpu; 1690 secbase = (unsigned long)mod_percpu(mod);
1519 else 1691 else
1520 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1692 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1521 sym[i].st_value += secbase; 1693 sym[i].st_value += secbase;
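
simplify_symbols() can now tell "symbol absent" (NULL, acceptable for weak symbols) apart from "resolution failed with an errno", because resolve_symbol() encodes small negative error numbers directly in the returned pointer, the usual ERR_PTR()/IS_ERR()/PTR_ERR() convention from linux/err.h. A freestanding, simplified sketch of that convention:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_ERRNO 4095   /* top 4095 addresses are reserved for errors */

static inline void *err_ptr(long err)      { return (void *)err; }
static inline long  ptr_err(const void *p) { return (long)p; }
static inline bool  is_err(const void *p)
{
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *resolve(const char *name)
{
    if (!name)
        return NULL;                 /* absent: caller may accept (weak) */
    if (name[0] == '_')
        return err_ptr(-EBUSY);      /* owner still initializing */
    static int value = 42;
    return &value;                   /* resolved */
}

int main(void)
{
    const char *names[] = { "ok_symbol", "_not_ready", NULL };
    for (int i = 0; i < 3; i++) {
        void *sym = resolve(names[i]);
        if (sym && !is_err(sym))
            printf("%s: resolved\n", names[i]);
        else if (!sym)
            printf("(null): absent, ok if weak\n");
        else
            printf("%s: error %ld\n", names[i], ptr_err(sym));
    }
    return 0;
}
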
@@ -1892,16 +2064,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1892#endif 2064#endif
1893} 2065}
1894 2066
2067static void dynamic_debug_remove(struct _ddebug *debug)
2068{
2069 if (debug)
2070 ddebug_remove_module(debug->modname);
2071}
2072
1895static void *module_alloc_update_bounds(unsigned long size) 2073static void *module_alloc_update_bounds(unsigned long size)
1896{ 2074{
1897 void *ret = module_alloc(size); 2075 void *ret = module_alloc(size);
1898 2076
1899 if (ret) { 2077 if (ret) {
2078 mutex_lock(&module_mutex);
1900 /* Update module bounds. */ 2079 /* Update module bounds. */
1901 if ((unsigned long)ret < module_addr_min) 2080 if ((unsigned long)ret < module_addr_min)
1902 module_addr_min = (unsigned long)ret; 2081 module_addr_min = (unsigned long)ret;
1903 if ((unsigned long)ret + size > module_addr_max) 2082 if ((unsigned long)ret + size > module_addr_max)
1904 module_addr_max = (unsigned long)ret + size; 2083 module_addr_max = (unsigned long)ret + size;
2084 mutex_unlock(&module_mutex);
1905 } 2085 }
1906 return ret; 2086 return ret;
1907} 2087}
@@ -1949,8 +2129,11 @@ static noinline struct module *load_module(void __user *umod,
1949 unsigned int modindex, versindex, infoindex, pcpuindex; 2129 unsigned int modindex, versindex, infoindex, pcpuindex;
1950 struct module *mod; 2130 struct module *mod;
1951 long err = 0; 2131 long err = 0;
1952 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2132 void *ptr = NULL; /* Stops spurious gcc warning */
1953 unsigned long symoffs, stroffs, *strmap; 2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
1954 2137
1955 mm_segment_t old_fs; 2138 mm_segment_t old_fs;
1956 2139
@@ -2075,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod,
2075 goto free_mod; 2258 goto free_mod;
2076 } 2259 }
2077 2260
2078 if (find_module(mod->name)) {
2079 err = -EEXIST;
2080 goto free_mod;
2081 }
2082
2083 mod->state = MODULE_STATE_COMING; 2261 mod->state = MODULE_STATE_COMING;
2084 2262
2085 /* Allow arches to frob section contents and sizes. */ 2263 /* Allow arches to frob section contents and sizes. */
@@ -2089,16 +2267,14 @@ static noinline struct module *load_module(void __user *umod,
2089 2267
2090 if (pcpuindex) { 2268 if (pcpuindex) {
2091 /* We have a special allocation for this section. */ 2269 /* We have a special allocation for this section. */
2092 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2093 sechdrs[pcpuindex].sh_addralign, 2271 sechdrs[pcpuindex].sh_addralign);
2094 mod->name); 2272 if (err)
2095 if (!percpu) {
2096 err = -ENOMEM;
2097 goto free_mod; 2273 goto free_mod;
2098 }
2099 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2100 mod->percpu = percpu;
2101 } 2275 }
2276 /* Keep this around for failure path. */
2277 percpu = mod_percpu(mod);
2102 2278
2103 /* Determine total sizes, and put offsets in sh_entsize. For now 2279 /* Determine total sizes, and put offsets in sh_entsize. For now
2104 this is done generically; there doesn't appear to be any 2280 this is done generically; there doesn't appear to be any
@@ -2162,9 +2338,8 @@ static noinline struct module *load_module(void __user *umod,
2162 mod = (void *)sechdrs[modindex].sh_addr; 2338 mod = (void *)sechdrs[modindex].sh_addr;
2163 kmemleak_load_module(mod, hdr, sechdrs, secstrings); 2339 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2164 2340
2165#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2341#if defined(CONFIG_MODULE_UNLOAD)
2166 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2342 mod->refptr = alloc_percpu(struct module_ref);
2167 mod->name);
2168 if (!mod->refptr) { 2343 if (!mod->refptr) {
2169 err = -ENOMEM; 2344 err = -ENOMEM;
2170 goto free_init; 2345 goto free_init;
@@ -2173,11 +2348,6 @@ static noinline struct module *load_module(void __user *umod,
2173 /* Now we've moved module, initialize linked lists, etc. */ 2348 /* Now we've moved module, initialize linked lists, etc. */
2174 module_unload_init(mod); 2349 module_unload_init(mod);
2175 2350
2176 /* add kobject, so we can reference it. */
2177 err = mod_sysfs_init(mod);
2178 if (err)
2179 goto free_unload;
2180
2181 /* Set up license info based on the info section */ 2351 /* Set up license info based on the info section */
2182 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
2183 2353
@@ -2302,18 +2472,13 @@ static noinline struct module *load_module(void __user *umod,
2302 goto cleanup; 2472 goto cleanup;
2303 } 2473 }
2304 2474
2305 /* Find duplicate symbols */
2306 err = verify_export_symbols(mod);
2307 if (err < 0)
2308 goto cleanup;
2309
2310 /* Set up and sort exception table */ 2475 /* Set up and sort exception table */
2311 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", 2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2312 sizeof(*mod->extable), &mod->num_exentries); 2477 sizeof(*mod->extable), &mod->num_exentries);
2313 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2314 2479
2315 /* Finally, copy percpu area over. */ 2480 /* Finally, copy percpu area over. */
2316 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2317 sechdrs[pcpuindex].sh_size); 2482 sechdrs[pcpuindex].sh_size);
2318 2483
2319 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2321,15 +2486,9 @@ static noinline struct module *load_module(void __user *umod,
2321 kfree(strmap); 2486 kfree(strmap);
2322 strmap = NULL; 2487 strmap = NULL;
2323 2488
2324 if (!mod->taints) { 2489 if (!mod->taints)
2325 struct _ddebug *debug;
2326 unsigned int num_debug;
2327
2328 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2329 sizeof(*debug), &num_debug); 2491 sizeof(*debug), &num_debug);
2330 if (debug)
2331 dynamic_debug_setup(debug, num_debug);
2332 }
2333 2492
2334 err = module_finalize(hdr, sechdrs, mod); 2493 err = module_finalize(hdr, sechdrs, mod);
2335 if (err < 0) 2494 if (err < 0)
@@ -2365,7 +2524,22 @@ static noinline struct module *load_module(void __user *umod,
2365 * function to insert in a way safe to concurrent readers. 2524 * function to insert in a way safe to concurrent readers.
2366 * The mutex protects against concurrent writers. 2525 * The mutex protects against concurrent writers.
2367 */ 2526 */
2527 mutex_lock(&module_mutex);
2528 if (find_module(mod->name)) {
2529 err = -EEXIST;
2530 goto unlock;
2531 }
2532
2533 if (debug)
2534 dynamic_debug_setup(debug, num_debug);
2535
2536 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod);
2538 if (err < 0)
2539 goto ddebug;
2540
2368 list_add_rcu(&mod->list, &modules); 2541 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex);
2369 2543
2370 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2371 if (err < 0) 2545 if (err < 0)
@@ -2374,6 +2548,7 @@ static noinline struct module *load_module(void __user *umod,
2374 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2375 if (err < 0) 2549 if (err < 0)
2376 goto unlink; 2550 goto unlink;
2551
2377 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2378 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2379 2554
@@ -2386,18 +2561,20 @@ static noinline struct module *load_module(void __user *umod,
2386 return mod; 2561 return mod;
2387 2562
2388 unlink: 2563 unlink:
2564 mutex_lock(&module_mutex);
2389 /* Unlink carefully: kallsyms could be walking list. */ 2565 /* Unlink carefully: kallsyms could be walking list. */
2390 list_del_rcu(&mod->list); 2566 list_del_rcu(&mod->list);
2567 ddebug:
2568 dynamic_debug_remove(debug);
2569 unlock:
2570 mutex_unlock(&module_mutex);
2391 synchronize_sched(); 2571 synchronize_sched();
2392 module_arch_cleanup(mod); 2572 module_arch_cleanup(mod);
2393 cleanup: 2573 cleanup:
2394 free_modinfo(mod); 2574 free_modinfo(mod);
2395 kobject_del(&mod->mkobj.kobj);
2396 kobject_put(&mod->mkobj.kobj);
2397 free_unload:
2398 module_unload_free(mod); 2575 module_unload_free(mod);
2399#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2576#if defined(CONFIG_MODULE_UNLOAD)
2400 percpu_modfree(mod->refptr); 2577 free_percpu(mod->refptr);
2401 free_init: 2578 free_init:
2402#endif 2579#endif
2403 module_free(mod, mod->module_init); 2580 module_free(mod, mod->module_init);
@@ -2405,8 +2582,7 @@ static noinline struct module *load_module(void __user *umod,
2405 module_free(mod, mod->module_core); 2582 module_free(mod, mod->module_core);
2406 /* mod will be freed with core. Don't access it beyond this line! */ 2583 /* mod will be freed with core. Don't access it beyond this line! */
2407 free_percpu: 2584 free_percpu:
2408 if (percpu) 2585 free_percpu(percpu);
2409 percpu_modfree(percpu);
2410 free_mod: 2586 free_mod:
2411 kfree(args); 2587 kfree(args);
2412 kfree(strmap); 2588 kfree(strmap);
@@ -2442,19 +2618,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2442 if (!capable(CAP_SYS_MODULE) || modules_disabled) 2618 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2443 return -EPERM; 2619 return -EPERM;
2444 2620
2445 /* Only one module load at a time, please */
2446 if (mutex_lock_interruptible(&module_mutex) != 0)
2447 return -EINTR;
2448
2449 /* Do all the hard work */ 2621 /* Do all the hard work */
2450 mod = load_module(umod, len, uargs); 2622 mod = load_module(umod, len, uargs);
2451 if (IS_ERR(mod)) { 2623 if (IS_ERR(mod))
2452 mutex_unlock(&module_mutex);
2453 return PTR_ERR(mod); 2624 return PTR_ERR(mod);
2454 }
2455
2456 /* Drop lock so they can recurse */
2457 mutex_unlock(&module_mutex);
2458 2625
2459 blocking_notifier_call_chain(&module_notify_list, 2626 blocking_notifier_call_chain(&module_notify_list,
2460 MODULE_STATE_COMING, mod); 2627 MODULE_STATE_COMING, mod);
@@ -2471,9 +2638,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2471 module_put(mod); 2638 module_put(mod);
2472 blocking_notifier_call_chain(&module_notify_list, 2639 blocking_notifier_call_chain(&module_notify_list,
2473 MODULE_STATE_GOING, mod); 2640 MODULE_STATE_GOING, mod);
2474 mutex_lock(&module_mutex);
2475 free_module(mod); 2641 free_module(mod);
2476 mutex_unlock(&module_mutex);
2477 wake_up(&module_wq); 2642 wake_up(&module_wq);
2478 return ret; 2643 return ret;
2479 } 2644 }
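Taken together, the module.c hunks above narrow module_mutex: instead of sys_init_module() holding it across the whole load, load_module() now takes it only around the window in which the module becomes globally visible. A condensed sketch of that window, with the error unwinding simplified relative to the real unlock/ddebug labels:

	mutex_lock(&module_mutex);
	if (find_module(mod->name)) {		/* lost a race with a concurrent load */
		err = -EEXIST;
		goto unlock;
	}
	if (debug)
		dynamic_debug_setup(debug, num_debug);	/* only once the name is reserved */
	err = verify_export_symbols(mod);	/* reject duplicate exported symbols */
	if (err < 0)
		goto ddebug;
	list_add_rcu(&mod->list, &modules);	/* publish to lockless readers */
	mutex_unlock(&module_mutex);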
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..4c0b7b3e6d2e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
172 struct thread_info *owner; 172 struct thread_info *owner;
173 173
174 /* 174 /*
175 * If we own the BKL, then don't spin. The owner of
176 * the mutex might be waiting on us to release the BKL.
177 */
178 if (unlikely(current->lock_depth >= 0))
179 break;
180
181 /*
175 * If there's an owner, wait for it to either 182 * If there's an owner, wait for it to either
176 * release the lock or go to sleep. 183 * release the lock or go to sleep.
177 */ 184 */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
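The rcu_dereference_raw() conversions above reflect that notifier chains are not always walked under rcu_read_lock(): blocking chains are protected by nh->rwsem, and with CONFIG_PROVE_RCU a plain rcu_dereference() would trigger lockdep-RCU warnings there. A minimal sketch of the distinction, with my_head, my_rwsem and struct foo as hypothetical stand-ins:

	struct foo *p;

	rcu_read_lock();
	p = rcu_dereference(my_head);		/* checked: requires an RCU read-side section */
	rcu_read_unlock();

	down_read(&my_rwsem);			/* some protection other than RCU */
	p = rcu_dereference_raw(my_head);	/* unchecked: the caller vouches for safety */
	up_read(&my_rwsem);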
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
@@ -24,7 +25,18 @@
24 25
25static struct kmem_cache *nsproxy_cachep; 26static struct kmem_cache *nsproxy_cachep;
26 27
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 28struct nsproxy init_nsproxy = {
29 .count = ATOMIC_INIT(1),
30 .uts_ns = &init_uts_ns,
31#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
32 .ipc_ns = &init_ipc_ns,
33#endif
34 .mnt_ns = NULL,
35 .pid_ns = &init_pid_ns,
36#ifdef CONFIG_NET
37 .net_ns = &init_net,
38#endif
39};
28 40
29static inline struct nsproxy *create_nsproxy(void) 41static inline struct nsproxy *create_nsproxy(void)
30{ 42{
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..fdd8ae609ce3
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,774 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/rcupdate.h>
30
31#define MAX_SEQ_NR INT_MAX - NR_CPUS
32#define MAX_OBJ_NUM 1000
33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{
36 int cpu, target_cpu;
37
38 target_cpu = cpumask_first(pd->cpumask);
39 for (cpu = 0; cpu < cpu_index; cpu++)
40 target_cpu = cpumask_next(target_cpu, pd->cpumask);
41
42 return target_cpu;
43}
44
45static int padata_cpu_hash(struct padata_priv *padata)
46{
47 int cpu_index;
48 struct parallel_data *pd;
49
50 pd = padata->pd;
51
52 /*
53 * Hash the sequence numbers to the cpus by taking
54 * seq_nr mod. number of cpus in use.
55 */
56 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
57
58 return padata_index_to_cpu(pd, cpu_index);
59}
60
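Worked example of the hashing above: with pd->cpumask covering CPUs {2, 5, 6} (num_cpus = 3), an object with seq_nr = 7 gets cpu_index = 7 % 3 = 1, and padata_index_to_cpu() walks the mask to CPU 5. Consecutive sequence numbers therefore round-robin across 2, 5, 6, 2, 5, 6, ..., which is what later lets padata_get_next() predict the sequence number each per-cpu reorder queue must produce next.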
61static void padata_parallel_worker(struct work_struct *work)
62{
63 struct padata_queue *queue;
64 struct parallel_data *pd;
65 struct padata_instance *pinst;
66 LIST_HEAD(local_list);
67
68 local_bh_disable();
69 queue = container_of(work, struct padata_queue, pwork);
70 pd = queue->pd;
71 pinst = pd->pinst;
72
73 spin_lock(&queue->parallel.lock);
74 list_replace_init(&queue->parallel.list, &local_list);
75 spin_unlock(&queue->parallel.lock);
76
77 while (!list_empty(&local_list)) {
78 struct padata_priv *padata;
79
80 padata = list_entry(local_list.next,
81 struct padata_priv, list);
82
83 list_del_init(&padata->list);
84
85 padata->parallel(padata);
86 }
87
88 local_bh_enable();
89}
90
91/**
92 * padata_do_parallel - padata parallelization function
93 *
94 * @pinst: padata instance
95 * @padata: object to be parallelized
96 * @cb_cpu: cpu the serialization callback function will run on,
97 * must be in the cpumask of padata.
98 *
99 * The parallelization callback function will run with BHs off.
100 * Note: Every object which is parallelized by padata_do_parallel
101 * must be seen by padata_do_serial.
102 */
103int padata_do_parallel(struct padata_instance *pinst,
104 struct padata_priv *padata, int cb_cpu)
105{
106 int target_cpu, err;
107 struct padata_queue *queue;
108 struct parallel_data *pd;
109
110 rcu_read_lock_bh();
111
112 pd = rcu_dereference(pinst->pd);
113
114 err = 0;
115 if (!(pinst->flags & PADATA_INIT))
116 goto out;
117
118 err = -EBUSY;
119 if ((pinst->flags & PADATA_RESET))
120 goto out;
121
122 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
123 goto out;
124
125 err = -EINVAL;
126 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
127 goto out;
128
129 err = -EINPROGRESS;
130 atomic_inc(&pd->refcnt);
131 padata->pd = pd;
132 padata->cb_cpu = cb_cpu;
133
134 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
135 atomic_set(&pd->seq_nr, -1);
136
137 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
138
139 target_cpu = padata_cpu_hash(padata);
140 queue = per_cpu_ptr(pd->queue, target_cpu);
141
142 spin_lock(&queue->parallel.lock);
143 list_add_tail(&padata->list, &queue->parallel.list);
144 spin_unlock(&queue->parallel.lock);
145
146 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
147
148out:
149 rcu_read_unlock_bh();
150
151 return err;
152}
153EXPORT_SYMBOL(padata_do_parallel);
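A hedged usage sketch of the interface exported above, modeled on how a crypto-style user drives it; every my_* name is hypothetical and pinst is assumed to come from padata_alloc()/padata_start() further down in this file:

	struct my_request {
		struct padata_priv padata;
		void *payload;
	};

	static void my_parallel(struct padata_priv *padata)
	{
		struct my_request *req = container_of(padata, struct my_request, padata);

		my_do_heavy_work(req);		/* runs with BHs off on the hashed CPU */
		padata_do_serial(padata);	/* hand back for in-order completion */
	}

	static void my_serial(struct padata_priv *padata)
	{
		struct my_request *req = container_of(padata, struct my_request, padata);

		my_complete(req);		/* runs on cb_cpu, in submission order */
	}

	static int my_submit(struct padata_instance *pinst, struct my_request *req, int cb_cpu)
	{
		req->padata.parallel = my_parallel;
		req->padata.serial = my_serial;
		/* cb_cpu must be in the instance's cpumask; -EINPROGRESS means queued */
		return padata_do_parallel(pinst, &req->padata, cb_cpu);
	}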
154
155/*
156 * padata_get_next - Get the next object that needs serialization.
157 *
158 * Return values are:
159 *
160 * A pointer to the control struct of the next object that needs
161 * serialization, if present in one of the percpu reorder queues.
162 *
163 * NULL, if all percpu reorder queues are empty.
164 *
165 * -EINPROGRESS, if the next object that needs serialization will
166 * be parallel processed by another cpu and is not yet present in
167 * the cpu's reorder queue.
168 *
169 * -ENODATA, if this cpu has to do the parallel processing for
170 * the next object.
171 */
172static struct padata_priv *padata_get_next(struct parallel_data *pd)
173{
174 int cpu, num_cpus, empty, calc_seq_nr;
175 int seq_nr, next_nr, overrun, next_overrun;
176 struct padata_queue *queue, *next_queue;
177 struct padata_priv *padata;
178 struct padata_list *reorder;
179
180 empty = 0;
181 next_nr = -1;
182 next_overrun = 0;
183 next_queue = NULL;
184
185 num_cpus = cpumask_weight(pd->cpumask);
186
187 for_each_cpu(cpu, pd->cpumask) {
188 queue = per_cpu_ptr(pd->queue, cpu);
189 reorder = &queue->reorder;
190
191 /*
192 * Calculate the seq_nr of the object that should be
193 * next in this reorder queue.
194 */
195 overrun = 0;
196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
197 + queue->cpu_index;
198
199 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
200 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
201 overrun = 1;
202 }
203
204 if (!list_empty(&reorder->list)) {
205 padata = list_entry(reorder->list.next,
206 struct padata_priv, list);
207
208 seq_nr = padata->seq_nr;
209 BUG_ON(calc_seq_nr != seq_nr);
210 } else {
211 seq_nr = calc_seq_nr;
212 empty++;
213 }
214
215 if (next_nr < 0 || seq_nr < next_nr
216 || (next_overrun && !overrun)) {
217 next_nr = seq_nr;
218 next_overrun = overrun;
219 next_queue = queue;
220 }
221 }
222
223 padata = NULL;
224
225 if (empty == num_cpus)
226 goto out;
227
228 reorder = &next_queue->reorder;
229
230 if (!list_empty(&reorder->list)) {
231 padata = list_entry(reorder->list.next,
232 struct padata_priv, list);
233
234 if (unlikely(next_overrun)) {
235 for_each_cpu(cpu, pd->cpumask) {
236 queue = per_cpu_ptr(pd->queue, cpu);
237 atomic_set(&queue->num_obj, 0);
238 }
239 }
240
241 spin_lock(&reorder->lock);
242 list_del_init(&padata->list);
243 atomic_dec(&pd->reorder_objects);
244 spin_unlock(&reorder->lock);
245
246 atomic_inc(&next_queue->num_obj);
247
248 goto out;
249 }
250
251 queue = per_cpu_ptr(pd->queue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) {
253 padata = ERR_PTR(-ENODATA);
254 goto out;
255 }
256
257 padata = ERR_PTR(-EINPROGRESS);
258out:
259 return padata;
260}
261
262static void padata_reorder(struct parallel_data *pd)
263{
264 struct padata_priv *padata;
265 struct padata_queue *queue;
266 struct padata_instance *pinst = pd->pinst;
267
268 /*
269 * We need to ensure that only one cpu can work on dequeueing of
270 * the reorder queue at a time. Calculating in which percpu reorder
271 * queue the next object will arrive takes some time. A spinlock
272 * would be highly contended. Also it is not clear in which order
273 * the objects arrive to the reorder queues. So a cpu could wait to
274 * get the lock just to notice that there is nothing to do at the
275 * moment. Therefore we use a trylock and let the holder of the lock
276 * care for all the objects enqueued during the holdtime of the lock.
277 */
278 if (!spin_trylock_bh(&pd->lock))
279 return;
280
281 while (1) {
282 padata = padata_get_next(pd);
283
284 /*
285 * All reorder queues are empty, or the next object that needs
286 * serialization is parallel processed by another cpu and is
287 * still on its way to the cpu's reorder queue, nothing to
288 * do for now.
289 */
290 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
291 break;
292
293 /*
294 * This cpu has to do the parallel processing of the next
295 * object. It's waiting in the cpu's parallelization queue,
296 * so exit immediately.
297 */
298 if (PTR_ERR(padata) == -ENODATA) {
299 del_timer(&pd->timer);
300 spin_unlock_bh(&pd->lock);
301 return;
302 }
303
304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
305
306 spin_lock(&queue->serial.lock);
307 list_add_tail(&padata->list, &queue->serial.list);
308 spin_unlock(&queue->serial.lock);
309
310 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
311 }
312
313 spin_unlock_bh(&pd->lock);
314
315 /*
316 * The next object that needs serialization might have arrived in
317 * the reorder queues in the meantime; we will be called again
318 * from the timer function if no one else cares for it.
319 */
320 if (atomic_read(&pd->reorder_objects)
321 && !(pinst->flags & PADATA_RESET))
322 mod_timer(&pd->timer, jiffies + HZ);
323 else
324 del_timer(&pd->timer);
325
326 return;
327}
328
329static void padata_reorder_timer(unsigned long arg)
330{
331 struct parallel_data *pd = (struct parallel_data *)arg;
332
333 padata_reorder(pd);
334}
335
336static void padata_serial_worker(struct work_struct *work)
337{
338 struct padata_queue *queue;
339 struct parallel_data *pd;
340 LIST_HEAD(local_list);
341
342 local_bh_disable();
343 queue = container_of(work, struct padata_queue, swork);
344 pd = queue->pd;
345
346 spin_lock(&queue->serial.lock);
347 list_replace_init(&queue->serial.list, &local_list);
348 spin_unlock(&queue->serial.lock);
349
350 while (!list_empty(&local_list)) {
351 struct padata_priv *padata;
352
353 padata = list_entry(local_list.next,
354 struct padata_priv, list);
355
356 list_del_init(&padata->list);
357
358 padata->serial(padata);
359 atomic_dec(&pd->refcnt);
360 }
361 local_bh_enable();
362}
363
364/**
365 * padata_do_serial - padata serialization function
366 *
367 * @padata: object to be serialized.
368 *
369 * padata_do_serial must be called for every parallelized object.
370 * The serialization callback function will run with BHs off.
371 */
372void padata_do_serial(struct padata_priv *padata)
373{
374 int cpu;
375 struct padata_queue *queue;
376 struct parallel_data *pd;
377
378 pd = padata->pd;
379
380 cpu = get_cpu();
381 queue = per_cpu_ptr(pd->queue, cpu);
382
383 spin_lock(&queue->reorder.lock);
384 atomic_inc(&pd->reorder_objects);
385 list_add_tail(&padata->list, &queue->reorder.list);
386 spin_unlock(&queue->reorder.lock);
387
388 put_cpu();
389
390 padata_reorder(pd);
391}
392EXPORT_SYMBOL(padata_do_serial);
393
394/* Allocate and initialize the internal cpumask-dependent resources. */
395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
396 const struct cpumask *cpumask)
397{
398 int cpu, cpu_index, num_cpus;
399 struct padata_queue *queue;
400 struct parallel_data *pd;
401
402 cpu_index = 0;
403
404 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
405 if (!pd)
406 goto err;
407
408 pd->queue = alloc_percpu(struct padata_queue);
409 if (!pd->queue)
410 goto err_free_pd;
411
412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
413 goto err_free_queue;
414
415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
416
417 for_each_cpu(cpu, pd->cpumask) {
418 queue = per_cpu_ptr(pd->queue, cpu);
419
420 queue->pd = pd;
421
422 queue->cpu_index = cpu_index;
423 cpu_index++;
424
425 INIT_LIST_HEAD(&queue->reorder.list);
426 INIT_LIST_HEAD(&queue->parallel.list);
427 INIT_LIST_HEAD(&queue->serial.list);
428 spin_lock_init(&queue->reorder.lock);
429 spin_lock_init(&queue->parallel.lock);
430 spin_lock_init(&queue->serial.lock);
431
432 INIT_WORK(&queue->pwork, padata_parallel_worker);
433 INIT_WORK(&queue->swork, padata_serial_worker);
434 atomic_set(&queue->num_obj, 0);
435 }
436
437 num_cpus = cpumask_weight(pd->cpumask);
438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
439
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
441 atomic_set(&pd->seq_nr, -1);
442 atomic_set(&pd->reorder_objects, 0);
443 atomic_set(&pd->refcnt, 0);
444 pd->pinst = pinst;
445 spin_lock_init(&pd->lock);
446
447 return pd;
448
449err_free_queue:
450 free_percpu(pd->queue);
451err_free_pd:
452 kfree(pd);
453err:
454 return NULL;
455}
456
457static void padata_free_pd(struct parallel_data *pd)
458{
459 free_cpumask_var(pd->cpumask);
460 free_percpu(pd->queue);
461 kfree(pd);
462}
463
464/* Flush all objects out of the padata queues. */
465static void padata_flush_queues(struct parallel_data *pd)
466{
467 int cpu;
468 struct padata_queue *queue;
469
470 for_each_cpu(cpu, pd->cpumask) {
471 queue = per_cpu_ptr(pd->queue, cpu);
472 flush_work(&queue->pwork);
473 }
474
475 del_timer_sync(&pd->timer);
476
477 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd);
479
480 for_each_cpu(cpu, pd->cpumask) {
481 queue = per_cpu_ptr(pd->queue, cpu);
482 flush_work(&queue->swork);
483 }
484
485 BUG_ON(atomic_read(&pd->refcnt) != 0);
486}
487
488/* Replace the internal control structure with a new one. */
489static void padata_replace(struct padata_instance *pinst,
490 struct parallel_data *pd_new)
491{
492 struct parallel_data *pd_old = pinst->pd;
493
494 pinst->flags |= PADATA_RESET;
495
496 rcu_assign_pointer(pinst->pd, pd_new);
497
498 synchronize_rcu();
499
500 padata_flush_queues(pd_old);
501 padata_free_pd(pd_old);
502
503 pinst->flags &= ~PADATA_RESET;
504}
505
506/**
507 * padata_set_cpumask - set the cpumask that padata should use
508 *
509 * @pinst: padata instance
510 * @cpumask: the cpumask to use
511 */
512int padata_set_cpumask(struct padata_instance *pinst,
513 cpumask_var_t cpumask)
514{
515 struct parallel_data *pd;
516 int err = 0;
517
518 mutex_lock(&pinst->lock);
519
520 get_online_cpus();
521
522 pd = padata_alloc_pd(pinst, cpumask);
523 if (!pd) {
524 err = -ENOMEM;
525 goto out;
526 }
527
528 cpumask_copy(pinst->cpumask, cpumask);
529
530 padata_replace(pinst, pd);
531
532out:
533 put_online_cpus();
534
535 mutex_unlock(&pinst->lock);
536
537 return err;
538}
539EXPORT_SYMBOL(padata_set_cpumask);
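A short sketch of reconfiguring a live instance with the call exported above (error handling trimmed, pinst assumed valid):

	cpumask_var_t new_mask;
	int err = -ENOMEM;

	if (alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
		cpumask_copy(new_mask, cpumask_of_node(0));	/* e.g. restrict to one node */
		err = padata_set_cpumask(pinst, new_mask);	/* swaps in a fresh parallel_data */
		free_cpumask_var(new_mask);
	}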
540
541static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
542{
543 struct parallel_data *pd;
544
545 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
546 pd = padata_alloc_pd(pinst, pinst->cpumask);
547 if (!pd)
548 return -ENOMEM;
549
550 padata_replace(pinst, pd);
551 }
552
553 return 0;
554}
555
556/**
557 * padata_add_cpu - add a cpu to the padata cpumask
558 *
559 * @pinst: padata instance
560 * @cpu: cpu to add
561 */
562int padata_add_cpu(struct padata_instance *pinst, int cpu)
563{
564 int err;
565
566 mutex_lock(&pinst->lock);
567
568 get_online_cpus();
569 cpumask_set_cpu(cpu, pinst->cpumask);
570 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus();
572
573 mutex_unlock(&pinst->lock);
574
575 return err;
576}
577EXPORT_SYMBOL(padata_add_cpu);
578
579static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
580{
581 struct parallel_data *pd;
582
583 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
584 pd = padata_alloc_pd(pinst, pinst->cpumask);
585 if (!pd)
586 return -ENOMEM;
587
588 padata_replace(pinst, pd);
589 }
590
591 return 0;
592}
593
594/**
595 * padata_remove_cpu - remove a cpu from the padata cpumask
596 *
597 * @pinst: padata instance
598 * @cpu: cpu to remove
599 */
600int padata_remove_cpu(struct padata_instance *pinst, int cpu)
601{
602 int err;
603
604 mutex_lock(&pinst->lock);
605
606 get_online_cpus();
607 cpumask_clear_cpu(cpu, pinst->cpumask);
608 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus();
610
611 mutex_unlock(&pinst->lock);
612
613 return err;
614}
615EXPORT_SYMBOL(padata_remove_cpu);
616
617/**
618 * padata_start - start the parallel processing
619 *
620 * @pinst: padata instance to start
621 */
622void padata_start(struct padata_instance *pinst)
623{
624 mutex_lock(&pinst->lock);
625 pinst->flags |= PADATA_INIT;
626 mutex_unlock(&pinst->lock);
627}
628EXPORT_SYMBOL(padata_start);
629
630/**
631 * padata_stop - stop the parallel processing
632 *
633 * @pinst: padata instance to stop
634 */
635void padata_stop(struct padata_instance *pinst)
636{
637 mutex_lock(&pinst->lock);
638 pinst->flags &= ~PADATA_INIT;
639 mutex_unlock(&pinst->lock);
640}
641EXPORT_SYMBOL(padata_stop);
642
643#ifdef CONFIG_HOTPLUG_CPU
644static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu)
646{
647 int err;
648 struct padata_instance *pinst;
649 int cpu = (unsigned long)hcpu;
650
651 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
652
653 switch (action) {
654 case CPU_ONLINE:
655 case CPU_ONLINE_FROZEN:
656 if (!cpumask_test_cpu(cpu, pinst->cpumask))
657 break;
658 mutex_lock(&pinst->lock);
659 err = __padata_add_cpu(pinst, cpu);
660 mutex_unlock(&pinst->lock);
661 if (err)
662 return notifier_from_errno(err);
663 break;
664
665 case CPU_DOWN_PREPARE:
666 case CPU_DOWN_PREPARE_FROZEN:
667 if (!cpumask_test_cpu(cpu, pinst->cpumask))
668 break;
669 mutex_lock(&pinst->lock);
670 err = __padata_remove_cpu(pinst, cpu);
671 mutex_unlock(&pinst->lock);
672 if (err)
673 return notifier_from_errno(err);
674 break;
675
676 case CPU_UP_CANCELED:
677 case CPU_UP_CANCELED_FROZEN:
678 if (!cpumask_test_cpu(cpu, pinst->cpumask))
679 break;
680 mutex_lock(&pinst->lock);
681 __padata_remove_cpu(pinst, cpu);
682 mutex_unlock(&pinst->lock);
683
684 case CPU_DOWN_FAILED:
685 case CPU_DOWN_FAILED_FROZEN:
686 if (!cpumask_test_cpu(cpu, pinst->cpumask))
687 break;
688 mutex_lock(&pinst->lock);
689 __padata_add_cpu(pinst, cpu);
690 mutex_unlock(&pinst->lock);
691 }
692
693 return NOTIFY_OK;
694}
695#endif
696
697/**
698 * padata_alloc - allocate and initialize a padata instance
699 *
700 * @cpumask: cpumask that padata uses for parallelization
701 * @wq: workqueue to use for the allocated padata instance
702 */
703struct padata_instance *padata_alloc(const struct cpumask *cpumask,
704 struct workqueue_struct *wq)
705{
706 struct padata_instance *pinst;
707 struct parallel_data *pd;
708
709 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
710 if (!pinst)
711 goto err;
712
713 get_online_cpus();
714
715 pd = padata_alloc_pd(pinst, cpumask);
716 if (!pd)
717 goto err_free_inst;
718
719 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
720 goto err_free_pd;
721
722 rcu_assign_pointer(pinst->pd, pd);
723
724 pinst->wq = wq;
725
726 cpumask_copy(pinst->cpumask, cpumask);
727
728 pinst->flags = 0;
729
730#ifdef CONFIG_HOTPLUG_CPU
731 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
732 pinst->cpu_notifier.priority = 0;
733 register_hotcpu_notifier(&pinst->cpu_notifier);
734#endif
735
736 put_online_cpus();
737
738 mutex_init(&pinst->lock);
739
740 return pinst;
741
742err_free_pd:
743 padata_free_pd(pd);
744err_free_inst:
745 kfree(pinst);
746 put_online_cpus();
747err:
748 return NULL;
749}
750EXPORT_SYMBOL(padata_alloc);
751
752/**
753 * padata_free - free a padata instance
754 *
755 * @padata_inst: padata instance to free
756 */
757void padata_free(struct padata_instance *pinst)
758{
759 padata_stop(pinst);
760
761 synchronize_rcu();
762
763#ifdef CONFIG_HOTPLUG_CPU
764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
770 padata_free_pd(pinst->pd);
771 free_cpumask_var(pinst->cpumask);
772 kfree(pinst);
773}
774EXPORT_SYMBOL(padata_free);
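An end-to-end sketch of instance lifetime using the calls exported above; my_wq and my_pinst are hypothetical module-level variables:

	static struct workqueue_struct *my_wq;
	static struct padata_instance *my_pinst;

	static int __init my_init(void)
	{
		my_wq = create_workqueue("my_padata");
		if (!my_wq)
			return -ENOMEM;

		my_pinst = padata_alloc(cpu_possible_mask, my_wq);
		if (!my_pinst) {
			destroy_workqueue(my_wq);
			return -ENOMEM;
		}

		padata_start(my_pinst);		/* sets PADATA_INIT; submissions may begin */
		return 0;
	}

	static void __exit my_exit(void)
	{
		padata_stop(my_pinst);		/* clears PADATA_INIT; new submissions fail */
		padata_free(my_pinst);		/* flushes queues and frees parallel_data */
		destroy_workqueue(my_wq);
	}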
diff --git a/kernel/panic.c b/kernel/panic.c
index c787333282b8..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
36 36
37EXPORT_SYMBOL(panic_notifier_list); 37EXPORT_SYMBOL(panic_notifier_list);
38 38
39static long no_blink(long time)
40{
41 return 0;
42}
43
44/* Returns how long it waited in ms */ 39/* Returns how long it waited in ms */
45long (*panic_blink)(long time); 40long (*panic_blink)(long time);
46EXPORT_SYMBOL(panic_blink); 41EXPORT_SYMBOL(panic_blink);
47 42
43static void panic_blink_one_second(void)
44{
45 static long i = 0, end;
46
47 if (panic_blink) {
48 end = i + MSEC_PER_SEC;
49
50 while (i < end) {
51 i += panic_blink(i);
52 mdelay(1);
53 i++;
54 }
55 } else {
56 /*
57 * When running under a hypervisor, a small mdelay may get
58 * rounded up to the hypervisor timeslice. For example, with
59 * a 1ms in 10ms hypervisor timeslice we might inflate an
60 * mdelay(1) loop by 10x.
61 *
62 * If we have nothing to blink, spin on 1 second calls to
63 * mdelay to avoid this.
64 */
65 mdelay(MSEC_PER_SEC);
66 }
67}
68
48/** 69/**
49 * panic - halt the system 70 * panic - halt the system
50 * @fmt: The text string to print 71 * @fmt: The text string to print
@@ -66,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
66 */ 87 */
67 preempt_disable(); 88 preempt_disable();
68 89
90 console_verbose();
69 bust_spinlocks(1); 91 bust_spinlocks(1);
70 va_start(args, fmt); 92 va_start(args, fmt);
71 vsnprintf(buf, sizeof(buf), fmt, args); 93 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -95,9 +117,6 @@ NORET_TYPE void panic(const char * fmt, ...)
95 117
96 bust_spinlocks(0); 118 bust_spinlocks(0);
97 119
98 if (!panic_blink)
99 panic_blink = no_blink;
100
101 if (panic_timeout > 0) { 120 if (panic_timeout > 0) {
102 /* 121 /*
103 * Delay timeout seconds before rebooting the machine. 122 * Delay timeout seconds before rebooting the machine.
@@ -105,11 +124,9 @@ NORET_TYPE void panic(const char * fmt, ...)
105 */ 124 */
106 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); 125 printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
107 126
108 for (i = 0; i < panic_timeout*1000; ) { 127 for (i = 0; i < panic_timeout; i++) {
109 touch_nmi_watchdog(); 128 touch_nmi_watchdog();
110 i += panic_blink(i); 129 panic_blink_one_second();
111 mdelay(1);
112 i++;
113 } 130 }
114 /* 131 /*
115 * This will not be a clean reboot, with everything 132 * This will not be a clean reboot, with everything
@@ -135,11 +152,9 @@ NORET_TYPE void panic(const char * fmt, ...)
135 } 152 }
136#endif 153#endif
137 local_irq_enable(); 154 local_irq_enable();
138 for (i = 0; ; ) { 155 while (1) {
139 touch_softlockup_watchdog(); 156 touch_softlockup_watchdog();
140 i += panic_blink(i); 157 panic_blink_one_second();
141 mdelay(1);
142 i++;
143 } 158 }
144} 159}
145 160
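After the refactoring above a board or LED driver hooks the same panic_blink pointer; the callback's return value still reports how many extra milliseconds it spent, so panic_blink_one_second() keeps roughly one-second pacing. A hypothetical sketch:

	static long my_board_panic_blink(long time)
	{
		my_board_led_toggle();		/* assumed board-specific helper */
		return 0;			/* extra ms consumed beyond the loop's mdelay(1) */
	}

	static int __init my_board_init(void)
	{
		panic_blink = my_board_panic_blink;
		return 0;
	}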
@@ -164,6 +179,7 @@ static const struct tnt tnts[] = {
164 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 179 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
165 { TAINT_WARN, 'W', ' ' }, 180 { TAINT_WARN, 'W', ' ' },
166 { TAINT_CRAP, 'C', ' ' }, 181 { TAINT_CRAP, 'C', ' ' },
182 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
167}; 183};
168 184
169/** 185/**
@@ -180,6 +196,7 @@ static const struct tnt tnts[] = {
180 * 'A' - ACPI table overridden. 196 * 'A' - ACPI table overridden.
181 * 'W' - Taint on warning. 197 * 'W' - Taint on warning.
182 * 'C' - modules from drivers/staging are loaded. 198 * 'C' - modules from drivers/staging are loaded.
199 * 'I' - Working around severe firmware bug.
183 * 200 *
184 * The string is overwritten by the next call to print_tainted(). 201 * The string is overwritten by the next call to print_tainted().
185 */ 202 */
@@ -351,7 +368,8 @@ struct slowpath_args {
351 va_list args; 368 va_list args;
352}; 369};
353 370
354static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) 371static void warn_slowpath_common(const char *file, int line, void *caller,
372 unsigned taint, struct slowpath_args *args)
355{ 373{
356 const char *board; 374 const char *board;
357 375
@@ -367,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
367 print_modules(); 385 print_modules();
368 dump_stack(); 386 dump_stack();
369 print_oops_end_marker(); 387 print_oops_end_marker();
370 add_taint(TAINT_WARN); 388 add_taint(taint);
371} 389}
372 390
373void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 391void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -376,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
376 394
377 args.fmt = fmt; 395 args.fmt = fmt;
378 va_start(args.args, fmt); 396 va_start(args.args, fmt);
379 warn_slowpath_common(file, line, __builtin_return_address(0), &args); 397 warn_slowpath_common(file, line, __builtin_return_address(0),
398 TAINT_WARN, &args);
380 va_end(args.args); 399 va_end(args.args);
381} 400}
382EXPORT_SYMBOL(warn_slowpath_fmt); 401EXPORT_SYMBOL(warn_slowpath_fmt);
383 402
403void warn_slowpath_fmt_taint(const char *file, int line,
404 unsigned taint, const char *fmt, ...)
405{
406 struct slowpath_args args;
407
408 args.fmt = fmt;
409 va_start(args.args, fmt);
410 warn_slowpath_common(file, line, __builtin_return_address(0),
411 taint, &args);
412 va_end(args.args);
413}
414EXPORT_SYMBOL(warn_slowpath_fmt_taint);
415
384void warn_slowpath_null(const char *file, int line) 416void warn_slowpath_null(const char *file, int line)
385{ 417{
386 warn_slowpath_common(file, line, __builtin_return_address(0), NULL); 418 warn_slowpath_common(file, line, __builtin_return_address(0),
419 TAINT_WARN, NULL);
387} 420}
388EXPORT_SYMBOL(warn_slowpath_null); 421EXPORT_SYMBOL(warn_slowpath_null);
389#endif 422#endif
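warn_slowpath_fmt_taint() lets a warning apply a taint other than 'W'; callers normally reach it through the WARN_TAINT() wrapper introduced alongside it (assumed available here). A hedged example with hypothetical fw_rev/MIN_FW_REV values, using the new TAINT_FIRMWARE_WORKAROUND flag:

	WARN_TAINT(fw_rev < MIN_FW_REV, TAINT_FIRMWARE_WORKAROUND,
		   "firmware rev %u is below %u, enabling workaround\n",
		   fw_rev, MIN_FW_REV);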
diff --git a/kernel/params.c b/kernel/params.c
index cf1b69183127..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,7 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
28 27
29#if 0 28#if 0
30#define DEBUGP printk 29#define DEBUGP printk
@@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
402} 401}
403 402
404/* sysfs output in /sys/modules/XYZ/parameters/ */ 403/* sysfs output in /sys/modules/XYZ/parameters/ */
405#define to_module_attr(n) container_of(n, struct module_attribute, attr); 404#define to_module_attr(n) container_of(n, struct module_attribute, attr)
406#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); 405#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
407 406
408extern struct kernel_param __start___param[], __stop___param[]; 407extern struct kernel_param __start___param[], __stop___param[];
409 408
@@ -421,7 +420,7 @@ struct module_param_attrs
421}; 420};
422 421
423#ifdef CONFIG_SYSFS 422#ifdef CONFIG_SYSFS
424#define to_param_attr(n) container_of(n, struct param_attribute, mattr); 423#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
425 424
426static ssize_t param_attr_show(struct module_attribute *mattr, 425static ssize_t param_attr_show(struct module_attribute *mattr,
427 struct module *mod, char *buf) 426 struct module *mod, char *buf)
@@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
517 new->grp.attrs = attrs; 516 new->grp.attrs = attrs;
518 517
519 /* Tack new one on the end. */ 518 /* Tack new one on the end. */
519 sysfs_attr_init(&new->attrs[num].mattr.attr);
520 new->attrs[num].param = kp; 520 new->attrs[num].param = kp;
521 new->attrs[num].mattr.show = param_attr_show; 521 new->attrs[num].mattr.show = param_attr_show;
522 new->attrs[num].mattr.store = param_attr_store; 522 new->attrs[num].mattr.store = param_attr_store;
@@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
723 return ret; 723 return ret;
724} 724}
725 725
726static struct sysfs_ops module_sysfs_ops = { 726static const struct sysfs_ops module_sysfs_ops = {
727 .show = module_attr_show, 727 .show = module_attr_show,
728 .store = module_attr_store, 728 .store = module_attr_store,
729}; 729};
@@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
737 return 0; 737 return 0;
738} 738}
739 739
740static struct kset_uevent_ops module_uevent_ops = { 740static const struct kset_uevent_ops module_uevent_ops = {
741 .filter = uevent_filter, 741 .filter = uevent_filter,
742}; 742};
743 743
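The dropped trailing semicolons in the params.c container_of() macros matter because those macros expand to expressions; with the semicolon they only work as complete statements. A standalone illustration with a hypothetical struct foo:

	struct foo {
		struct kobject kobj;
		int count;
	};

	#define to_foo_old(k)	container_of(k, struct foo, kobj);	/* trailing ';' */
	#define to_foo(k)	container_of(k, struct foo, kobj)

	static int foo_count(struct kobject *k)
	{
		return to_foo(k)->count;
		/* return to_foo_old(k)->count; would not compile: the stray ';'
		 * in the expansion ends the return statement before "->count". */
	}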
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index d27746bd3a06..ff86c558af4c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,8 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
18#include <linux/sysfs.h> 20#include <linux/sysfs.h>
19#include <linux/dcache.h> 21#include <linux/dcache.h>
20#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -56,21 +58,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 58 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 59int sysctl_perf_event_paranoid __read_mostly = 1;
58 60
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 61int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 62
76/* 63/*
@@ -96,40 +83,19 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
96void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
97void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
98 85
99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101
102int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu)
106{
107 return 0;
108}
109
110void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
111 87
112static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
113 89
114void __perf_disable(void)
115{
116 __get_cpu_var(perf_disable_count)++;
117}
118
119bool __perf_enable(void)
120{
121 return !--__get_cpu_var(perf_disable_count);
122}
123
124void perf_disable(void) 90void perf_disable(void)
125{ 91{
126 __perf_disable(); 92 if (!__get_cpu_var(perf_disable_count)++)
127 hw_perf_disable(); 93 hw_perf_disable();
128} 94}
129 95
130void perf_enable(void) 96void perf_enable(void)
131{ 97{
132 if (__perf_enable()) 98 if (!--__get_cpu_var(perf_disable_count))
133 hw_perf_enable(); 99 hw_perf_enable();
134} 100}
135 101
@@ -248,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 214
249static inline u64 perf_clock(void) 215static inline u64 perf_clock(void)
250{ 216{
251 return cpu_clock(smp_processor_id()); 217 return cpu_clock(raw_smp_processor_id());
252} 218}
253 219
254/* 220/*
@@ -290,24 +256,49 @@ static void update_event_times(struct perf_event *event)
290} 256}
291 257
292/* 258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
270static struct list_head *
271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
272{
273 if (event->attr.pinned)
274 return &ctx->pinned_groups;
275 else
276 return &ctx->flexible_groups;
277}
278
279/*
293 * Add an event to the lists for its context. 280 * Add an event to the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 281 * Must be called with ctx->mutex and ctx->lock held.
295 */ 282 */
296static void 283static void
297list_add_event(struct perf_event *event, struct perf_event_context *ctx) 284list_add_event(struct perf_event *event, struct perf_event_context *ctx)
298{ 285{
299 struct perf_event *group_leader = event->group_leader; 286 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
287 event->attach_state |= PERF_ATTACH_CONTEXT;
300 288
301 /* 289 /*
302 * Depending on whether it is a standalone or sibling event, 290 * If we're a stand alone event or group leader, we go to the context
303 * add it straight to the context's event list, or to the group 291 * list, group events are kept attached to the group so that
304 * leader's sibling list: 292 * perf_group_detach can, at all times, locate all siblings.
305 */ 293 */
306 if (group_leader == event) 294 if (event->group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 295 struct list_head *list;
308 else { 296
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 297 if (is_software_event(event))
310 group_leader->nr_siblings++; 298 event->group_flags |= PERF_GROUP_SOFTWARE;
299
300 list = ctx_group_list(event, ctx);
301 list_add_tail(&event->group_entry, list);
311 } 302 }
312 303
313 list_add_rcu(&event->event_entry, &ctx->event_list); 304 list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -316,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
316 ctx->nr_stat++; 307 ctx->nr_stat++;
317} 308}
318 309
310static void perf_group_attach(struct perf_event *event)
311{
312 struct perf_event *group_leader = event->group_leader;
313
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP;
316
317 if (group_leader == event)
318 return;
319
320 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
321 !is_software_event(event))
322 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
323
324 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++;
326}
327
319/* 328/*
320 * Remove an event from the lists for its context. 329 * Remove an event from the lists for its context.
321 * Must be called with ctx->mutex and ctx->lock held. 330 * Must be called with ctx->mutex and ctx->lock held.
@@ -323,21 +332,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
323static void 332static void
324list_del_event(struct perf_event *event, struct perf_event_context *ctx) 333list_del_event(struct perf_event *event, struct perf_event_context *ctx)
325{ 334{
326 struct perf_event *sibling, *tmp; 335 /*
327 336 * We can have double detach due to exit/hot-unplug + close.
328 if (list_empty(&event->group_entry)) 337 */
338 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
329 return; 339 return;
340
341 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342
330 ctx->nr_events--; 343 ctx->nr_events--;
331 if (event->attr.inherit_stat) 344 if (event->attr.inherit_stat)
332 ctx->nr_stat--; 345 ctx->nr_stat--;
333 346
334 list_del_init(&event->group_entry);
335 list_del_rcu(&event->event_entry); 347 list_del_rcu(&event->event_entry);
336 348
337 if (event->group_leader != event) 349 if (event->group_leader == event)
338 event->group_leader->nr_siblings--; 350 list_del_init(&event->group_entry);
339 351
340 update_event_times(event); 352 update_group_times(event);
341 353
342 /* 354 /*
343 * If event was in error state, then keep it 355 * If event was in error state, then keep it
@@ -348,16 +360,45 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
348 */ 360 */
349 if (event->state > PERF_EVENT_STATE_OFF) 361 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF; 362 event->state = PERF_EVENT_STATE_OFF;
363}
364
365static void perf_group_detach(struct perf_event *event)
366{
367 struct perf_event *sibling, *tmp;
368 struct list_head *list = NULL;
369
370 /*
371 * We can have double detach due to exit/hot-unplug + close.
372 */
373 if (!(event->attach_state & PERF_ATTACH_GROUP))
374 return;
375
376 event->attach_state &= ~PERF_ATTACH_GROUP;
377
378 /*
379 * If this is a sibling, remove it from its group.
380 */
381 if (event->group_leader != event) {
382 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--;
384 return;
385 }
386
387 if (!list_empty(&event->group_entry))
388 list = &event->group_entry;
351 389
352 /* 390 /*
353 * If this was a group event with sibling events then 391 * If this was a group event with sibling events then
354 * upgrade the siblings to singleton events by adding them 392 * upgrade the siblings to singleton events by adding them
355 * to the context list directly: 393 * to whatever list we are on.
356 */ 394 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 395 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
358 396 if (list)
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 397 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 398 sibling->group_leader = sibling;
399
400 /* Inherit group flags from the previous leader */
401 sibling->group_flags = event->group_flags;
361 } 402 }
362} 403}
363 404
@@ -508,18 +549,6 @@ retry:
508} 549}
509 550
510/* 551/*
511 * Update total_time_enabled and total_time_running for all events in a group.
512 */
513static void update_group_times(struct perf_event *leader)
514{
515 struct perf_event *event;
516
517 update_event_times(leader);
518 list_for_each_entry(event, &leader->sibling_list, group_entry)
519 update_event_times(event);
520}
521
522/*
523 * Cross CPU call to disable a performance event 552 * Cross CPU call to disable a performance event
524 */ 553 */
525static void __perf_event_disable(void *info) 554static void __perf_event_disable(void *info)
@@ -608,14 +637,13 @@ void perf_event_disable(struct perf_event *event)
608static int 637static int
609event_sched_in(struct perf_event *event, 638event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 639 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 640 struct perf_event_context *ctx)
612 int cpu)
613{ 641{
614 if (event->state <= PERF_EVENT_STATE_OFF) 642 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 643 return 0;
616 644
617 event->state = PERF_EVENT_STATE_ACTIVE; 645 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 646 event->oncpu = smp_processor_id();
619 /* 647 /*
620 * The new state must be visible before we turn it on in the hardware: 648 * The new state must be visible before we turn it on in the hardware:
621 */ 649 */
@@ -642,33 +670,47 @@ event_sched_in(struct perf_event *event,
642static int 670static int
643group_sched_in(struct perf_event *group_event, 671group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 672 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 673 struct perf_event_context *ctx)
646 int cpu)
647{ 674{
648 struct perf_event *event, *partial_group; 675 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu;
677 bool txn = false;
649 int ret; 678 int ret;
650 679
651 if (group_event->state == PERF_EVENT_STATE_OFF) 680 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 681 return 0;
653 682
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 683 /* Check if group transaction available */
655 if (ret) 684 if (pmu->start_txn)
656 return ret < 0 ? ret : 0; 685 txn = true;
657 686
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 687 if (txn)
688 pmu->start_txn(pmu);
689
690 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn)
692 pmu->cancel_txn(pmu);
659 return -EAGAIN; 693 return -EAGAIN;
694 }
660 695
661 /* 696 /*
662 * Schedule in siblings as one group (if any): 697 * Schedule in siblings as one group (if any):
663 */ 698 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 699 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 700 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 701 partial_group = event;
667 goto group_error; 702 goto group_error;
668 } 703 }
669 } 704 }
670 705
671 return 0; 706 if (!txn)
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0;
713 }
672 714
673group_error: 715group_error:
674 /* 716 /*
@@ -682,25 +724,10 @@ group_error:
682 } 724 }
683 event_sched_out(group_event, cpuctx, ctx); 725 event_sched_out(group_event, cpuctx, ctx);
684 726
685 return -EAGAIN; 727 if (txn)
686} 728 pmu->cancel_txn(pmu);
687
688/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702 729
703 return 1; 730 return -EAGAIN;
704} 731}
705 732
706/* 733/*
@@ -713,7 +740,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 740 /*
714 * Groups consisting entirely of software events can always go on. 741 * Groups consisting entirely of software events can always go on.
715 */ 742 */
716 if (is_software_only_group(event)) 743 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 744 return 1;
718 /* 745 /*
719 * If an exclusive group is already on, no other hardware 746 * If an exclusive group is already on, no other hardware
@@ -738,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
738 struct perf_event_context *ctx) 765 struct perf_event_context *ctx)
739{ 766{
740 list_add_event(event, ctx); 767 list_add_event(event, ctx);
768 perf_group_attach(event);
741 event->tstamp_enabled = ctx->time; 769 event->tstamp_enabled = ctx->time;
742 event->tstamp_running = ctx->time; 770 event->tstamp_running = ctx->time;
743 event->tstamp_stopped = ctx->time; 771 event->tstamp_stopped = ctx->time;
@@ -754,7 +782,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 782 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 783 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 784 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 785 int err;
759 786
760 /* 787 /*
@@ -801,7 +828,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 828 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 829 err = -EEXIST;
803 else 830 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 831 err = event_sched_in(event, cpuctx, ctx);
805 832
806 if (err) { 833 if (err) {
807 /* 834 /*
@@ -943,11 +970,9 @@ static void __perf_event_enable(void *info)
943 } else { 970 } else {
944 perf_disable(); 971 perf_disable();
945 if (event == leader) 972 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 973 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 974 else
949 err = event_sched_in(event, cpuctx, ctx, 975 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 976 perf_enable();
952 } 977 }
953 978
@@ -1043,8 +1068,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1068 return 0;
1044} 1069}
1045 1070
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1071enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1072 EVENT_FLEXIBLE = 0x1,
1073 EVENT_PINNED = 0x2,
1074 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1075};
1076
1077static void ctx_sched_out(struct perf_event_context *ctx,
1078 struct perf_cpu_context *cpuctx,
1079 enum event_type_t event_type)
1048{ 1080{
1049 struct perf_event *event; 1081 struct perf_event *event;
1050 1082
@@ -1055,10 +1087,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1087 update_context_time(ctx);
1056 1088
1057 perf_disable(); 1089 perf_disable();
1058 if (ctx->nr_active) { 1090 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1091 goto out_enable;
1092
1093 if (event_type & EVENT_PINNED)
1094 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1095 group_sched_out(event, cpuctx, ctx);
1061 } 1096
1097 if (event_type & EVENT_FLEXIBLE)
1098 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1099 group_sched_out(event, cpuctx, ctx);
1100
1101 out_enable:
1062 perf_enable(); 1102 perf_enable();
1063 out: 1103 out:
1064 raw_spin_unlock(&ctx->lock); 1104 raw_spin_unlock(&ctx->lock);
@@ -1170,17 +1210,15 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1210 * not restart the event.
1171 */ 1211 */
1172void perf_event_task_sched_out(struct task_struct *task, 1212void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1213 struct task_struct *next)
1174{ 1214{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1216 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1217 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1218 struct perf_event_context *parent;
1179 struct pt_regs *regs;
1180 int do_switch = 1; 1219 int do_switch = 1;
1181 1220
1182 regs = task_pt_regs(task); 1221 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1183 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1184 1222
1185 if (likely(!ctx || !cpuctx->task_ctx)) 1223 if (likely(!ctx || !cpuctx->task_ctx))
1186 return; 1224 return;
@@ -1220,15 +1258,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1258 rcu_read_unlock();
1221 1259
1222 if (do_switch) { 1260 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1261 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1262 cpuctx->task_ctx = NULL;
1225 } 1263 }
1226} 1264}
1227 1265
1228/* 1266static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1267 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1268{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1269 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1270
@@ -1238,47 +1274,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1274 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1275 return;
1240 1276
1241 __perf_event_sched_out(ctx, cpuctx); 1277 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1278 cpuctx->task_ctx = NULL;
1243} 1279}
1244 1280
1245/* 1281/*
1246 * Called with IRQs disabled 1282 * Called with IRQs disabled
1247 */ 1283 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1284static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1249{ 1285{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1286 task_ctx_sched_out(ctx, EVENT_ALL);
1287}
1288
1289/*
1290 * Called with IRQs disabled
1291 */
1292static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1293 enum event_type_t event_type)
1294{
1295 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1296}
1252 1297
1253static void 1298static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1299ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1300 struct perf_cpu_context *cpuctx)
1256{ 1301{
1257 struct perf_event *event; 1302 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264
1265 ctx->timestamp = perf_clock();
1266 1303
1267 perf_disable(); 1304 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1268 1305 if (event->state <= PERF_EVENT_STATE_OFF)
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1306 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1307 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1308 continue;
1279 1309
1280 if (group_can_go_on(event, cpuctx, 1)) 1310 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1311 group_sched_in(event, cpuctx, ctx);
1282 1312
1283 /* 1313 /*
1284 * If this pinned group hasn't been scheduled, 1314 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1319,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1319 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1320 }
1291 } 1321 }
1322}
1292 1323
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1324static void
1294 /* 1325ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1326 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1327{
1297 */ 1328 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1329 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1330
1331 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1332 /* Ignore events in OFF or ERROR state */
1333 if (event->state <= PERF_EVENT_STATE_OFF)
1334 continue;
1302 /* 1335 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1336 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1337 * of events:
1305 */ 1338 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1339 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1340 continue;
1308 1341
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1342 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1343 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1344 can_add_hw = 0;
1312 } 1345 }
1346}
1347
1348static void
1349ctx_sched_in(struct perf_event_context *ctx,
1350 struct perf_cpu_context *cpuctx,
1351 enum event_type_t event_type)
1352{
1353 raw_spin_lock(&ctx->lock);
1354 ctx->is_active = 1;
1355 if (likely(!ctx->nr_events))
1356 goto out;
1357
1358 ctx->timestamp = perf_clock();
1359
1360 perf_disable();
1361
1362 /*
1363 * First go through the list and put on any pinned groups
1364 * in order to give them the best chance of going on.
1365 */
1366 if (event_type & EVENT_PINNED)
1367 ctx_pinned_sched_in(ctx, cpuctx);
1368
1369 /* Then walk through the lower prio flexible groups */
1370 if (event_type & EVENT_FLEXIBLE)
1371 ctx_flexible_sched_in(ctx, cpuctx);
1372
1313 perf_enable(); 1373 perf_enable();
1314 out: 1374 out:
1315 raw_spin_unlock(&ctx->lock); 1375 raw_spin_unlock(&ctx->lock);
1316} 1376}
1317 1377
1378static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1379 enum event_type_t event_type)
1380{
1381 struct perf_event_context *ctx = &cpuctx->ctx;
1382
1383 ctx_sched_in(ctx, cpuctx, event_type);
1384}
1385
1386static void task_ctx_sched_in(struct task_struct *task,
1387 enum event_type_t event_type)
1388{
1389 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1390 struct perf_event_context *ctx = task->perf_event_ctxp;
1391
1392 if (likely(!ctx))
1393 return;
1394 if (cpuctx->task_ctx == ctx)
1395 return;
1396 ctx_sched_in(ctx, cpuctx, event_type);
1397 cpuctx->task_ctx = ctx;
1398}
1318/* 1399/*
1319 * Called from scheduler to add the events of the current task 1400 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1401 * with interrupts disabled.
@@ -1326,38 +1407,135 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1407 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1408 * keep the event running.
1328 */ 1409 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1410void perf_event_task_sched_in(struct task_struct *task)
1330{ 1411{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1412 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1413 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1414
1334 if (likely(!ctx)) 1415 if (likely(!ctx))
1335 return; 1416 return;
1417
1336 if (cpuctx->task_ctx == ctx) 1418 if (cpuctx->task_ctx == ctx)
1337 return; 1419 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1420
1421 perf_disable();
1422
1423 /*
1424 * We want to keep the following priority order:
1425 * cpu pinned (that don't need to move), task pinned,
1426 * cpu flexible, task flexible.
1427 */
1428 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1429
1430 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1431 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1432 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1433
1339 cpuctx->task_ctx = ctx; 1434 cpuctx->task_ctx = ctx;
1435
1436 perf_enable();
1340} 1437}
1341 1438
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1439#define MAX_INTERRUPTS (~0ULL)
1440
1441static void perf_log_throttle(struct perf_event *event, int enable);
1442
1443static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1444{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1445 u64 frequency = event->attr.sample_freq;
1446 u64 sec = NSEC_PER_SEC;
1447 u64 divisor, dividend;
1345 1448
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1449 int count_fls, nsec_fls, frequency_fls, sec_fls;
1450
1451 count_fls = fls64(count);
1452 nsec_fls = fls64(nsec);
1453 frequency_fls = fls64(frequency);
1454 sec_fls = 30;
1455
1456 /*
1457 * We got @count in @nsec, with a target of sample_freq HZ
1458 * the target period becomes:
1459 *
1460 * @count * 10^9
1461 * period = -------------------
1462 * @nsec * sample_freq
1463 *
1464 */
1465
1466 /*
1467 * Reduce accuracy by one bit such that @a and @b converge
1468 * to a similar magnitude.
1469 */
1470#define REDUCE_FLS(a, b) \
1471do { \
1472 if (a##_fls > b##_fls) { \
1473 a >>= 1; \
1474 a##_fls--; \
1475 } else { \
1476 b >>= 1; \
1477 b##_fls--; \
1478 } \
1479} while (0)
1480
1481 /*
1482 * Reduce accuracy until either term fits in a u64, then proceed with
1483 * the other, so that finally we can do a u64/u64 division.
1484 */
1485 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1486 REDUCE_FLS(nsec, frequency);
1487 REDUCE_FLS(sec, count);
1488 }
1489
1490 if (count_fls + sec_fls > 64) {
1491 divisor = nsec * frequency;
1492
1493 while (count_fls + sec_fls > 64) {
1494 REDUCE_FLS(count, sec);
1495 divisor >>= 1;
1496 }
1497
1498 dividend = count * sec;
1499 } else {
1500 dividend = count * sec;
1501
1502 while (nsec_fls + frequency_fls > 64) {
1503 REDUCE_FLS(nsec, frequency);
1504 dividend >>= 1;
1505 }
1506
1507 divisor = nsec * frequency;
1508 }
1509
1510 if (!divisor)
1511 return dividend;
1512
1513 return div64_u64(dividend, divisor);
1347} 1514}
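
The new perf_calculate_period() above derives the next sample period from @count events observed over @nsec nanoseconds, shedding low-order bits with REDUCE_FLS() so the intermediate products never need 128-bit arithmetic. A rough user-space sketch of the same scaling idea follows; the function and helper names are illustrative, not kernel symbols, and the bit-shedding loop is simplified relative to the patch.

#include <stdio.h>
#include <stdint.h>

/* position of the highest set bit, 0 for x == 0 (stand-in for the kernel's fls64) */
static int fls64_(uint64_t x)
{
	int bits = 0;

	while (x) {
		bits++;
		x >>= 1;
	}
	return bits;
}

/* period = (count * NSEC_PER_SEC) / (nsec * freq), kept within 64-bit products */
static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	uint64_t sec = 1000000000ULL;		/* NSEC_PER_SEC */
	uint64_t dividend, divisor;

	/*
	 * Halve whichever factor dominates until both products fit in
	 * 64 bits; the patch does this job more precisely with REDUCE_FLS().
	 */
	while (fls64_(count) + fls64_(sec) > 64 || fls64_(nsec) + fls64_(freq) > 64) {
		if (fls64_(count) + fls64_(sec) > 64) {
			if (count > sec)
				count >>= 1;
			else
				sec >>= 1;
		} else {
			if (nsec > freq)
				nsec >>= 1;
			else
				freq >>= 1;
		}
	}

	dividend = count * sec;
	divisor = nsec * freq;

	return divisor ? dividend / divisor : dividend;
}

int main(void)
{
	/* 10000 events in 4 ms with a 1000 Hz target -> period of 2500 events */
	printf("%llu\n", (unsigned long long)calc_period(10000, 4000000, 1000));
	return 0;
}
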
1348 1515
1349#define MAX_INTERRUPTS (~0ULL) 1516static void perf_event_stop(struct perf_event *event)
1517{
1518 if (!event->pmu->stop)
1519 return event->pmu->disable(event);
1350 1520
1351static void perf_log_throttle(struct perf_event *event, int enable); 1521 return event->pmu->stop(event);
1522}
1523
1524static int perf_event_start(struct perf_event *event)
1525{
1526 if (!event->pmu->start)
1527 return event->pmu->enable(event);
1528
1529 return event->pmu->start(event);
1530}
1352 1531
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1533{
1355 struct hw_perf_event *hwc = &event->hw; 1534 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1535 s64 period, sample_period;
1357 s64 delta; 1536 s64 delta;
1358 1537
1359 events *= hwc->sample_period; 1538 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1539
1362 delta = (s64)(period - hwc->sample_period); 1540 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1541 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1546,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1546 sample_period = 1;
1369 1547
1370 hwc->sample_period = sample_period; 1548 hwc->sample_period = sample_period;
1549
1550 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1551 perf_disable();
1552 perf_event_stop(event);
1553 atomic64_set(&hwc->period_left, 0);
1554 perf_event_start(event);
1555 perf_enable();
1556 }
1371} 1557}
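
The (delta + 7) / 8 step above is a 1/8-gain low-pass filter, so the period drifts toward the freshly computed target instead of jumping to it, and the period_left > 8*sample_period branch restarts the counter when the old period is badly off. A tiny stand-alone illustration of the filter's convergence, with made-up values:

#include <stdio.h>

int main(void)
{
	long long sample_period = 1000000;	/* current period */
	long long target = 2000000;		/* period suggested by the calculation */
	int tick;

	for (tick = 1; tick <= 5; tick++) {
		long long delta = (target - sample_period + 7) / 8;

		sample_period += delta;		/* move 1/8th of the way per tick */
		printf("tick %d: period %lld\n", tick, sample_period);
	}
	return 0;
}
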
1372 1558
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1559static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1560{
1375 struct perf_event *event; 1561 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1562 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1563 u64 interrupts, now;
1564 s64 delta;
1378 1565
1379 raw_spin_lock(&ctx->lock); 1566 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1567 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1394,45 +1581,23 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1394 */ 1581 */
1395 if (interrupts == MAX_INTERRUPTS) { 1582 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1583 perf_log_throttle(event, 1);
1584 perf_disable();
1397 event->pmu->unthrottle(event); 1585 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ; 1586 perf_enable();
1399 } 1587 }
1400 1588
1401 if (!event->attr.freq || !event->attr.sample_freq) 1589 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1590 continue;
1403 1591
1404 /* 1592 perf_disable();
1405 * if the specified freq < HZ then we need to skip ticks 1593 event->pmu->read(event);
1406 */ 1594 now = atomic64_read(&event->count);
1407 if (event->attr.sample_freq < HZ) { 1595 delta = now - hwc->freq_count_stamp;
1408 freq = event->attr.sample_freq; 1596 hwc->freq_count_stamp = now;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1597
1424 /* 1598 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1599 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't 1600 perf_enable();
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1601 }
1437 raw_spin_unlock(&ctx->lock); 1602 raw_spin_unlock(&ctx->lock);
1438} 1603}
@@ -1442,51 +1607,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1607 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1608static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1609{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events)
1448 return;
1449
1450 raw_spin_lock(&ctx->lock); 1610 raw_spin_lock(&ctx->lock);
1451 /* 1611
1452 * Rotate the first entry last (works just fine for group events too): 1612 /* Rotate the first entry last of non-pinned groups */
1453 */ 1613 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1614
1461 raw_spin_unlock(&ctx->lock); 1615 raw_spin_unlock(&ctx->lock);
1462} 1616}
1463 1617
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1618void perf_event_task_tick(struct task_struct *curr)
1465{ 1619{
1466 struct perf_cpu_context *cpuctx; 1620 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1621 struct perf_event_context *ctx;
1622 int rotate = 0;
1468 1623
1469 if (!atomic_read(&nr_events)) 1624 if (!atomic_read(&nr_events))
1470 return; 1625 return;
1471 1626
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1627 cpuctx = &__get_cpu_var(perf_cpu_context);
1628 if (cpuctx->ctx.nr_events &&
1629 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1630 rotate = 1;
1631
1473 ctx = curr->perf_event_ctxp; 1632 ctx = curr->perf_event_ctxp;
1633 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1634 rotate = 1;
1474 1635
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1636 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1637 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1638 perf_ctx_adjust_freq(ctx);
1478 1639
1479 perf_event_cpu_sched_out(cpuctx); 1640 if (!rotate)
1641 return;
1642
1643 perf_disable();
1644 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1645 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1646 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1647
1483 rotate_ctx(&cpuctx->ctx); 1648 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1649 if (ctx)
1485 rotate_ctx(ctx); 1650 rotate_ctx(ctx);
1486 1651
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1652 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1653 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1654 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1655 perf_enable();
1656}
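
rotate_ctx() now just calls list_rotate_left() on the flexible group list, so a group that missed the hardware this tick moves toward the front for the next one, while pinned groups are never rotated. A minimal stand-in showing the effect of the rotation, using an array instead of a kernel list and invented group names:

#include <stdio.h>

/* move the first entry to the tail, like list_rotate_left() on flexible_groups */
static void rotate_left(const char **groups, int n)
{
	const char *first = groups[0];
	int i;

	for (i = 1; i < n; i++)
		groups[i - 1] = groups[i];
	groups[n - 1] = first;
}

int main(void)
{
	const char *flexible[] = { "groupA", "groupB", "groupC" };
	int tick;

	for (tick = 1; tick <= 3; tick++) {
		rotate_left(flexible, 3);
		printf("tick %d: %s %s %s\n", tick,
		       flexible[0], flexible[1], flexible[2]);
	}
	return 0;
}
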
1657
1658static int event_enable_on_exec(struct perf_event *event,
1659 struct perf_event_context *ctx)
1660{
1661 if (!event->attr.enable_on_exec)
1662 return 0;
1663
1664 event->attr.enable_on_exec = 0;
1665 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1666 return 0;
1667
1668 __perf_event_mark_enabled(event, ctx);
1669
1670 return 1;
1490} 1671}
1491 1672
1492/* 1673/*
@@ -1499,6 +1680,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1680 struct perf_event *event;
1500 unsigned long flags; 1681 unsigned long flags;
1501 int enabled = 0; 1682 int enabled = 0;
1683 int ret;
1502 1684
1503 local_irq_save(flags); 1685 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1686 ctx = task->perf_event_ctxp;
@@ -1509,14 +1691,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1691
1510 raw_spin_lock(&ctx->lock); 1692 raw_spin_lock(&ctx->lock);
1511 1693
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1694 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1695 ret = event_enable_on_exec(event, ctx);
1514 continue; 1696 if (ret)
1515 event->attr.enable_on_exec = 0; 1697 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1698 }
1517 continue; 1699
1518 __perf_event_mark_enabled(event, ctx); 1700 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1701 ret = event_enable_on_exec(event, ctx);
1702 if (ret)
1703 enabled = 1;
1520 } 1704 }
1521 1705
1522 /* 1706 /*
@@ -1527,7 +1711,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1711
1528 raw_spin_unlock(&ctx->lock); 1712 raw_spin_unlock(&ctx->lock);
1529 1713
1530 perf_event_task_sched_in(task, smp_processor_id()); 1714 perf_event_task_sched_in(task);
1531 out: 1715 out:
1532 local_irq_restore(flags); 1716 local_irq_restore(flags);
1533} 1717}
@@ -1590,7 +1774,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1774{
1591 raw_spin_lock_init(&ctx->lock); 1775 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1776 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1777 INIT_LIST_HEAD(&ctx->pinned_groups);
1778 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1779 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1780 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1781 ctx->task = task;
@@ -1698,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head)
1698} 1883}
1699 1884
1700static void perf_pending_sync(struct perf_event *event); 1885static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1701 1887
1702static void free_event(struct perf_event *event) 1888static void free_event(struct perf_event *event)
1703{ 1889{
@@ -1713,9 +1899,9 @@ static void free_event(struct perf_event *event)
1713 atomic_dec(&nr_task_events); 1899 atomic_dec(&nr_task_events);
1714 } 1900 }
1715 1901
1716 if (event->output) { 1902 if (event->data) {
1717 fput(event->output->filp); 1903 perf_mmap_data_put(event->data);
1718 event->output = NULL; 1904 event->data = NULL;
1719 } 1905 }
1720 1906
1721 if (event->destroy) 1907 if (event->destroy)
@@ -1729,9 +1915,30 @@ int perf_event_release_kernel(struct perf_event *event)
1729{ 1915{
1730 struct perf_event_context *ctx = event->ctx; 1916 struct perf_event_context *ctx = event->ctx;
1731 1917
1918 /*
1919 * Remove from the PMU, can't get re-enabled since we got
1920 * here because the last ref went.
1921 */
1922 perf_event_disable(event);
1923
1732 WARN_ON_ONCE(ctx->parent_ctx); 1924 WARN_ON_ONCE(ctx->parent_ctx);
1733 mutex_lock(&ctx->mutex); 1925 /*
1734 perf_event_remove_from_context(event); 1926 * There are two ways this annotation is useful:
1927 *
1928 * 1) there is a lock recursion from perf_event_exit_task
1929 * see the comment there.
1930 *
1931 * 2) there is a lock-inversion with mmap_sem through
1932 * perf_event_read_group(), which takes faults while
1933 * holding ctx->mutex, however this is called after
1934 * the last filedesc died, so there is no possibility
1935 * to trigger the AB-BA case.
1936 */
1937 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1938 raw_spin_lock_irq(&ctx->lock);
1939 perf_group_detach(event);
1940 list_del_event(event, ctx);
1941 raw_spin_unlock_irq(&ctx->lock);
1735 mutex_unlock(&ctx->mutex); 1942 mutex_unlock(&ctx->mutex);
1736 1943
1737 mutex_lock(&event->owner->perf_event_mutex); 1944 mutex_lock(&event->owner->perf_event_mutex);
@@ -2011,7 +2218,27 @@ unlock:
2011 return ret; 2218 return ret;
2012} 2219}
2013 2220
2014static int perf_event_set_output(struct perf_event *event, int output_fd); 2221static const struct file_operations perf_fops;
2222
2223static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2224{
2225 struct file *file;
2226
2227 file = fget_light(fd, fput_needed);
2228 if (!file)
2229 return ERR_PTR(-EBADF);
2230
2231 if (file->f_op != &perf_fops) {
2232 fput_light(file, *fput_needed);
2233 *fput_needed = 0;
2234 return ERR_PTR(-EBADF);
2235 }
2236
2237 return file->private_data;
2238}
2239
2240static int perf_event_set_output(struct perf_event *event,
2241 struct perf_event *output_event);
2015static int perf_event_set_filter(struct perf_event *event, void __user *arg); 2242static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2016 2243
2017static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2244static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2038,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2038 return perf_event_period(event, (u64 __user *)arg); 2265 return perf_event_period(event, (u64 __user *)arg);
2039 2266
2040 case PERF_EVENT_IOC_SET_OUTPUT: 2267 case PERF_EVENT_IOC_SET_OUTPUT:
2041 return perf_event_set_output(event, arg); 2268 {
2269 struct perf_event *output_event = NULL;
2270 int fput_needed = 0;
2271 int ret;
2272
2273 if (arg != -1) {
2274 output_event = perf_fget_light(arg, &fput_needed);
2275 if (IS_ERR(output_event))
2276 return PTR_ERR(output_event);
2277 }
2278
2279 ret = perf_event_set_output(event, output_event);
2280 if (output_event)
2281 fput_light(output_event->filp, fput_needed);
2282
2283 return ret;
2284 }
2042 2285
2043 case PERF_EVENT_IOC_SET_FILTER: 2286 case PERF_EVENT_IOC_SET_FILTER:
2044 return perf_event_set_filter(event, (void __user *)arg); 2287 return perf_event_set_filter(event, (void __user *)arg);
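
With this change PERF_EVENT_IOC_SET_OUTPUT takes another perf event fd (or -1 to undo the redirect), and perf_fget_light() verifies the fd really is a perf event before the buffers are shared. A rough user-space usage sketch, assuming 4 KiB pages and eliding all error handling; the sketch mmap()s the redirect target first so it owns a buffer:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_open(struct perf_event_attr *attr)
{
	/* pid = 0 (this task), cpu = -1, group_fd = -1, flags = 0 */
	return syscall(__NR_perf_event_open, attr, 0, -1, -1, 0);
}

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_SW_CONTEXT_SWITCHES,
		.sample_period	= 1,
		.sample_type	= PERF_SAMPLE_TID | PERF_SAMPLE_TIME,
	};
	int leader = perf_open(&attr);
	int other = perf_open(&attr);

	/* give the redirect target a buffer: 1 control page + 4 data pages */
	mmap(NULL, 5 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, leader, 0);

	/* route 'other's samples into 'leader's buffer; -1 would undo the redirect */
	ioctl(other, PERF_EVENT_IOC_SET_OUTPUT, leader);

	return 0;
}
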
@@ -2133,11 +2376,6 @@ unlock:
2133 rcu_read_unlock(); 2376 rcu_read_unlock();
2134} 2377}
2135 2378
2136static unsigned long perf_data_size(struct perf_mmap_data *data)
2137{
2138 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2139}
2140
2141#ifndef CONFIG_PERF_USE_VMALLOC 2379#ifndef CONFIG_PERF_USE_VMALLOC
2142 2380
2143/* 2381/*
@@ -2156,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2156 return virt_to_page(data->data_pages[pgoff - 1]); 2394 return virt_to_page(data->data_pages[pgoff - 1]);
2157} 2395}
2158 2396
2397static void *perf_mmap_alloc_page(int cpu)
2398{
2399 struct page *page;
2400 int node;
2401
2402 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2403 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2404 if (!page)
2405 return NULL;
2406
2407 return page_address(page);
2408}
2409
2159static struct perf_mmap_data * 2410static struct perf_mmap_data *
2160perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2161{ 2412{
@@ -2163,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2163 unsigned long size; 2414 unsigned long size;
2164 int i; 2415 int i;
2165 2416
2166 WARN_ON(atomic_read(&event->mmap_count));
2167
2168 size = sizeof(struct perf_mmap_data); 2417 size = sizeof(struct perf_mmap_data);
2169 size += nr_pages * sizeof(void *); 2418 size += nr_pages * sizeof(void *);
2170 2419
@@ -2172,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2172 if (!data) 2421 if (!data)
2173 goto fail; 2422 goto fail;
2174 2423
2175 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2424 data->user_page = perf_mmap_alloc_page(event->cpu);
2176 if (!data->user_page) 2425 if (!data->user_page)
2177 goto fail_user_page; 2426 goto fail_user_page;
2178 2427
2179 for (i = 0; i < nr_pages; i++) { 2428 for (i = 0; i < nr_pages; i++) {
2180 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2181 if (!data->data_pages[i]) 2430 if (!data->data_pages[i])
2182 goto fail_data_pages; 2431 goto fail_data_pages;
2183 } 2432 }
2184 2433
2185 data->data_order = 0;
2186 data->nr_pages = nr_pages; 2434 data->nr_pages = nr_pages;
2187 2435
2188 return data; 2436 return data;
@@ -2218,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2218 kfree(data); 2466 kfree(data);
2219} 2467}
2220 2468
2469static inline int page_order(struct perf_mmap_data *data)
2470{
2471 return 0;
2472}
2473
2221#else 2474#else
2222 2475
2223/* 2476/*
@@ -2226,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2226 * Required for architectures that have d-cache aliasing issues. 2479 * Required for architectures that have d-cache aliasing issues.
2227 */ 2480 */
2228 2481
2482static inline int page_order(struct perf_mmap_data *data)
2483{
2484 return data->page_order;
2485}
2486
2229static struct page * 2487static struct page *
2230perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2231{ 2489{
2232 if (pgoff > (1UL << data->data_order)) 2490 if (pgoff > (1UL << page_order(data)))
2233 return NULL; 2491 return NULL;
2234 2492
2235 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2249,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2249 int i, nr; 2507 int i, nr;
2250 2508
2251 data = container_of(work, struct perf_mmap_data, work); 2509 data = container_of(work, struct perf_mmap_data, work);
2252 nr = 1 << data->data_order; 2510 nr = 1 << page_order(data);
2253 2511
2254 base = data->user_page; 2512 base = data->user_page;
2255 for (i = 0; i < nr + 1; i++) 2513 for (i = 0; i < nr + 1; i++)
@@ -2271,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2271 unsigned long size; 2529 unsigned long size;
2272 void *all_buf; 2530 void *all_buf;
2273 2531
2274 WARN_ON(atomic_read(&event->mmap_count));
2275
2276 size = sizeof(struct perf_mmap_data); 2532 size = sizeof(struct perf_mmap_data);
2277 size += sizeof(void *); 2533 size += sizeof(void *);
2278 2534
@@ -2288,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2288 2544
2289 data->user_page = all_buf; 2545 data->user_page = all_buf;
2290 data->data_pages[0] = all_buf + PAGE_SIZE; 2546 data->data_pages[0] = all_buf + PAGE_SIZE;
2291 data->data_order = ilog2(nr_pages); 2547 data->page_order = ilog2(nr_pages);
2292 data->nr_pages = 1; 2548 data->nr_pages = 1;
2293 2549
2294 return data; 2550 return data;
@@ -2302,6 +2558,11 @@ fail:
2302 2558
2303#endif 2559#endif
2304 2560
2561static unsigned long perf_data_size(struct perf_mmap_data *data)
2562{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data));
2564}
2565
2305static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2306{ 2567{
2307 struct perf_event *event = vma->vm_file->private_data; 2568 struct perf_event *event = vma->vm_file->private_data;
@@ -2342,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2342{ 2603{
2343 long max_size = perf_data_size(data); 2604 long max_size = perf_data_size(data);
2344 2605
2345 atomic_set(&data->lock, -1);
2346
2347 if (event->attr.watermark) { 2606 if (event->attr.watermark) {
2348 data->watermark = min_t(long, max_size, 2607 data->watermark = min_t(long, max_size,
2349 event->attr.wakeup_watermark); 2608 event->attr.wakeup_watermark);
@@ -2352,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2352 if (!data->watermark) 2611 if (!data->watermark)
2353 data->watermark = max_size / 2; 2612 data->watermark = max_size / 2;
2354 2613
2355 2614 atomic_set(&data->refcount, 1);
2356 rcu_assign_pointer(event->data, data); 2615 rcu_assign_pointer(event->data, data);
2357} 2616}
2358 2617
@@ -2364,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2364 perf_mmap_data_free(data); 2623 perf_mmap_data_free(data);
2365} 2624}
2366 2625
2367static void perf_mmap_data_release(struct perf_event *event) 2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2368{ 2627{
2369 struct perf_mmap_data *data = event->data; 2628 struct perf_mmap_data *data;
2629
2630 rcu_read_lock();
2631 data = rcu_dereference(event->data);
2632 if (data) {
2633 if (!atomic_inc_not_zero(&data->refcount))
2634 data = NULL;
2635 }
2636 rcu_read_unlock();
2637
2638 return data;
2639}
2370 2640
2371 WARN_ON(atomic_read(&event->mmap_count)); 2641static void perf_mmap_data_put(struct perf_mmap_data *data)
2642{
2643 if (!atomic_dec_and_test(&data->refcount))
2644 return;
2372 2645
2373 rcu_assign_pointer(event->data, NULL);
2374 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2375} 2647}
2376 2648
@@ -2385,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2385{ 2657{
2386 struct perf_event *event = vma->vm_file->private_data; 2658 struct perf_event *event = vma->vm_file->private_data;
2387 2659
2388 WARN_ON_ONCE(event->ctx->parent_ctx);
2389 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2390 unsigned long size = perf_data_size(event->data); 2661 unsigned long size = perf_data_size(event->data);
2391 struct user_struct *user = current_user(); 2662 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data;
2392 2664
2393 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2394 vma->vm_mm->locked_vm -= event->data->nr_locked; 2666 vma->vm_mm->locked_vm -= event->mmap_locked;
2395 perf_mmap_data_release(event); 2667 rcu_assign_pointer(event->data, NULL);
2396 mutex_unlock(&event->mmap_mutex); 2668 mutex_unlock(&event->mmap_mutex);
2669
2670 perf_mmap_data_put(data);
2671 free_uid(user);
2397 } 2672 }
2398} 2673}
2399 2674
@@ -2416,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2416 long user_extra, extra; 2691 long user_extra, extra;
2417 int ret = 0; 2692 int ret = 0;
2418 2693
2694 /*
2695 * Don't allow mmap() of inherited per-task counters. This would
2696 * create a performance issue due to all children writing to the
2697 * same buffer.
2698 */
2699 if (event->cpu == -1 && event->attr.inherit)
2700 return -EINVAL;
2701
2419 if (!(vma->vm_flags & VM_SHARED)) 2702 if (!(vma->vm_flags & VM_SHARED))
2420 return -EINVAL; 2703 return -EINVAL;
2421 2704
@@ -2437,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2437 2720
2438 WARN_ON_ONCE(event->ctx->parent_ctx); 2721 WARN_ON_ONCE(event->ctx->parent_ctx);
2439 mutex_lock(&event->mmap_mutex); 2722 mutex_lock(&event->mmap_mutex);
2440 if (event->output) { 2723 if (event->data) {
2441 ret = -EINVAL; 2724 if (event->data->nr_pages == nr_pages)
2442 goto unlock; 2725 atomic_inc(&event->data->refcount);
2443 } 2726 else
2444
2445 if (atomic_inc_not_zero(&event->mmap_count)) {
2446 if (nr_pages != event->data->nr_pages)
2447 ret = -EINVAL; 2727 ret = -EINVAL;
2448 goto unlock; 2728 goto unlock;
2449 } 2729 }
@@ -2462,7 +2742,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2462 if (user_locked > user_lock_limit) 2742 if (user_locked > user_lock_limit)
2463 extra = user_locked - user_lock_limit; 2743 extra = user_locked - user_lock_limit;
2464 2744
2465 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2745 lock_limit = rlimit(RLIMIT_MEMLOCK);
2466 lock_limit >>= PAGE_SHIFT; 2746 lock_limit >>= PAGE_SHIFT;
2467 locked = vma->vm_mm->locked_vm + extra; 2747 locked = vma->vm_mm->locked_vm + extra;
2468 2748
@@ -2475,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2475 WARN_ON(event->data); 2755 WARN_ON(event->data);
2476 2756
2477 data = perf_mmap_data_alloc(event, nr_pages); 2757 data = perf_mmap_data_alloc(event, nr_pages);
2478 ret = -ENOMEM; 2758 if (!data) {
2479 if (!data) 2759 ret = -ENOMEM;
2480 goto unlock; 2760 goto unlock;
2761 }
2481 2762
2482 ret = 0;
2483 perf_mmap_data_init(event, data); 2763 perf_mmap_data_init(event, data);
2484
2485 atomic_set(&event->mmap_count, 1);
2486 atomic_long_add(user_extra, &user->locked_vm);
2487 vma->vm_mm->locked_vm += extra;
2488 event->data->nr_locked = extra;
2489 if (vma->vm_flags & VM_WRITE) 2764 if (vma->vm_flags & VM_WRITE)
2490 event->data->writable = 1; 2765 event->data->writable = 1;
2491 2766
2767 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra;
2769 event->mmap_user = get_current_user();
2770 vma->vm_mm->locked_vm += event->mmap_locked;
2771
2492unlock: 2772unlock:
2773 if (!ret)
2774 atomic_inc(&event->mmap_count);
2493 mutex_unlock(&event->mmap_mutex); 2775 mutex_unlock(&event->mmap_mutex);
2494 2776
2495 vma->vm_flags |= VM_RESERVED; 2777 vma->vm_flags |= VM_RESERVED;
@@ -2515,6 +2797,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2515} 2797}
2516 2798
2517static const struct file_operations perf_fops = { 2799static const struct file_operations perf_fops = {
2800 .llseek = no_llseek,
2518 .release = perf_release, 2801 .release = perf_release,
2519 .read = perf_read, 2802 .read = perf_read,
2520 .poll = perf_poll, 2803 .poll = perf_poll,
@@ -2658,6 +2941,33 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2658 return NULL; 2941 return NULL;
2659} 2942}
2660 2943
2944__weak
2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2946{
2947}
2948
2949
2950/*
2951 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is
2953 * another virtualization implementation supporting the callbacks.
2954 */
2955struct perf_guest_info_callbacks *perf_guest_cbs;
2956
2957int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2958{
2959 perf_guest_cbs = cbs;
2960 return 0;
2961}
2962EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2963
2964int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2965{
2966 perf_guest_cbs = NULL;
2967 return 0;
2968}
2969EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2970
2661/* 2971/*
2662 * Output 2972 * Output
2663 */ 2973 */
@@ -2693,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2693} 3003}
2694 3004
2695/* 3005/*
2696 * Curious locking construct.
2697 *
2698 * We need to ensure a later event_id doesn't publish a head when a former 3006 * We need to ensure a later event_id doesn't publish a head when a former
2699 * event_id isn't done writing. However since we need to deal with NMIs we 3007 * event isn't done writing. However since we need to deal with NMIs we
2700 * cannot fully serialize things. 3008 * cannot fully serialize things.
2701 * 3009 *
2702 * What we do is serialize between CPUs so we only have to deal with NMI
2703 * nesting on a single CPU.
2704 *
2705 * We only publish the head (and generate a wakeup) when the outer-most 3010 * We only publish the head (and generate a wakeup) when the outer-most
2706 * event_id completes. 3011 * event completes.
2707 */ 3012 */
2708static void perf_output_lock(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
2709{ 3014{
2710 struct perf_mmap_data *data = handle->data; 3015 struct perf_mmap_data *data = handle->data;
2711 int cur, cpu = get_cpu();
2712
2713 handle->locked = 0;
2714 3016
2715 for (;;) { 3017 preempt_disable();
2716 cur = atomic_cmpxchg(&data->lock, -1, cpu); 3018 local_inc(&data->nest);
2717 if (cur == -1) { 3019 handle->wakeup = local_read(&data->wakeup);
2718 handle->locked = 1;
2719 break;
2720 }
2721 if (cur == cpu)
2722 break;
2723
2724 cpu_relax();
2725 }
2726} 3020}
2727 3021
2728static void perf_output_unlock(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
2729{ 3023{
2730 struct perf_mmap_data *data = handle->data; 3024 struct perf_mmap_data *data = handle->data;
2731 unsigned long head; 3025 unsigned long head;
2732 int cpu;
2733
2734 data->done_head = data->head;
2735
2736 if (!handle->locked)
2737 goto out;
2738 3026
2739again: 3027again:
2740 /* 3028 head = local_read(&data->head);
2741 * The xchg implies a full barrier that ensures all writes are done
2742 * before we publish the new head, matched by a rmb() in userspace when
2743 * reading this position.
2744 */
2745 while ((head = atomic_long_xchg(&data->done_head, 0)))
2746 data->user_page->data_head = head;
2747 3029
2748 /* 3030 /*
2749 * NMI can happen here, which means we can miss a done_head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
2750 */ 3032 */
2751 3033
2752 cpu = atomic_xchg(&data->lock, -1); 3034 if (!local_dec_and_test(&data->nest))
2753 WARN_ON_ONCE(cpu != smp_processor_id()); 3035 goto out;
2754 3036
2755 /* 3037 /*
2756 * Therefore we have to validate we did not indeed do so. 3038 * Publish the known good head. Rely on the full barrier implied
3039 * by atomic_dec_and_test() order the data->head read and this
3040 * write.
2757 */ 3041 */
2758 if (unlikely(atomic_long_read(&data->done_head))) { 3042 data->user_page->data_head = head;
2759 /*
2760 * Since we had it locked, we can lock it again.
2761 */
2762 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2763 cpu_relax();
2764 3043
3044 /*
3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in atomic_dec_and_test() to re-read data->head.
3047 */
3048 if (unlikely(head != local_read(&data->head))) {
3049 local_inc(&data->nest);
2765 goto again; 3050 goto again;
2766 } 3051 }
2767 3052
2768 if (atomic_xchg(&data->wakeup, 0)) 3053 if (handle->wakeup != local_read(&data->wakeup))
2769 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
2770out: 3055
2771 put_cpu(); 3056 out:
3057 preempt_enable();
2772} 3058}
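
perf_output_get_handle()/perf_output_put_handle() above replace the old spinning cmpxchg lock with a per-buffer nesting counter: an interrupting writer only bumps the counter, the outermost writer publishes data_head, and head is re-checked in case a nested writer slipped in between the read and the publish. A single-threaded sketch of that shape, where plain ints stand in for local_t and a recursive call plays the part of the NMI:

#include <stdio.h>

static unsigned long head, data_head;	/* write cursor vs. head published to user space */
static int nest;

static void output_get(void)
{
	nest++;				/* local_inc(&data->nest) */
}

static void output_put(void)
{
	unsigned long seen;

again:
	seen = head;

	if (--nest)			/* nested writer: let the outermost one publish */
		return;

	data_head = seen;		/* publish the known-good head */

	/* a nested writer may have advanced head between the read and the publish */
	if (seen != head) {
		nest++;
		goto again;
	}
}

/* simulate one record write; optionally take a fake "NMI" mid-write */
static void write_record(unsigned long bytes, int take_nmi)
{
	output_get();
	head += bytes;
	if (take_nmi)
		write_record(16, 0);
	output_put();
}

int main(void)
{
	write_record(64, 1);
	printf("head=%lu published=%lu\n", head, data_head);	/* both 80 */
	return 0;
}
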
2773 3059
2774void perf_output_copy(struct perf_output_handle *handle, 3060__always_inline void perf_output_copy(struct perf_output_handle *handle,
2775 const void *buf, unsigned int len) 3061 const void *buf, unsigned int len)
2776{ 3062{
2777 unsigned int pages_mask;
2778 unsigned long offset;
2779 unsigned int size;
2780 void **pages;
2781
2782 offset = handle->offset;
2783 pages_mask = handle->data->nr_pages - 1;
2784 pages = handle->data->data_pages;
2785
2786 do { 3063 do {
2787 unsigned long page_offset; 3064 unsigned long size = min_t(unsigned long, handle->size, len);
2788 unsigned long page_size;
2789 int nr;
2790 3065
2791 nr = (offset >> PAGE_SHIFT) & pages_mask; 3066 memcpy(handle->addr, buf, size);
2792 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2793 page_offset = offset & (page_size - 1);
2794 size = min_t(unsigned int, page_size - page_offset, len);
2795 3067
2796 memcpy(pages[nr] + page_offset, buf, size); 3068 len -= size;
3069 handle->addr += size;
3070 buf += size;
3071 handle->size -= size;
3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data;
2797 3074
2798 len -= size; 3075 handle->page++;
2799 buf += size; 3076 handle->page &= data->nr_pages - 1;
2800 offset += size; 3077 handle->addr = data->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data);
3079 }
2801 } while (len); 3080 } while (len);
2802
2803 handle->offset = offset;
2804
2805 /*
2806 * Check we didn't copy past our reservation window, taking the
2807 * possible unsigned int wrap into account.
2808 */
2809 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2810} 3081}
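
The rewritten perf_output_copy() keeps the current page index, address and remaining room in the handle and masks the page index with nr_pages - 1, which works because the page count is a power of two. A stand-alone ring copy with tiny fake pages to make the wrap-around bookkeeping visible; the sizes are purely illustrative:

#include <stdio.h>
#include <string.h>

#define NR_PAGES	4	/* must be a power of two, as in perf */
#define PAGE_SZ		8	/* tiny "pages" so the wrap is visible */

static char pages[NR_PAGES][PAGE_SZ];

static void ring_copy(unsigned long head, const char *buf, size_t len)
{
	while (len) {
		unsigned long page = (head / PAGE_SZ) & (NR_PAGES - 1);
		unsigned long off = head % PAGE_SZ;
		size_t chunk = PAGE_SZ - off;	/* room left in this page */

		if (chunk > len)
			chunk = len;
		memcpy(&pages[page][off], buf, chunk);
		head += chunk;
		buf += chunk;
		len -= chunk;
	}
}

int main(void)
{
	const char *msg = "hello, ring buffer";
	int i;

	memset(pages, '.', sizeof(pages));
	ring_copy(5, msg, strlen(msg));
	for (i = 0; i < NR_PAGES; i++)
		printf("page %d: %.8s\n", i, pages[i]);
	return 0;
}
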
2811 3082
2812int perf_output_begin(struct perf_output_handle *handle, 3083int perf_output_begin(struct perf_output_handle *handle,
2813 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
2814 int nmi, int sample) 3085 int nmi, int sample)
2815{ 3086{
2816 struct perf_event *output_event;
2817 struct perf_mmap_data *data; 3087 struct perf_mmap_data *data;
2818 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
2819 int have_lost; 3089 int have_lost;
@@ -2830,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle,
2830 if (event->parent) 3100 if (event->parent)
2831 event = event->parent; 3101 event = event->parent;
2832 3102
2833 output_event = rcu_dereference(event->output);
2834 if (output_event)
2835 event = output_event;
2836
2837 data = rcu_dereference(event->data); 3103 data = rcu_dereference(event->data);
2838 if (!data) 3104 if (!data)
2839 goto out; 3105 goto out;
@@ -2844,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle,
2844 handle->sample = sample; 3110 handle->sample = sample;
2845 3111
2846 if (!data->nr_pages) 3112 if (!data->nr_pages)
2847 goto fail; 3113 goto out;
2848 3114
2849 have_lost = atomic_read(&data->lost); 3115 have_lost = local_read(&data->lost);
2850 if (have_lost) 3116 if (have_lost)
2851 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
2852 3118
2853 perf_output_lock(handle); 3119 perf_output_get_handle(handle);
2854 3120
2855 do { 3121 do {
2856 /* 3122 /*
@@ -2860,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle,
2860 */ 3126 */
2861 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(data->user_page->data_tail);
2862 smp_rmb(); 3128 smp_rmb();
2863 offset = head = atomic_long_read(&data->head); 3129 offset = head = local_read(&data->head);
2864 head += size; 3130 head += size;
2865 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(data, tail, offset, head)))
2866 goto fail; 3132 goto fail;
2867 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&data->head, offset, head) != offset);
2868 3134
2869 handle->offset = offset; 3135 if (head - local_read(&data->wakeup) > data->watermark)
2870 handle->head = head; 3136 local_add(data->watermark, &data->wakeup);
2871 3137
2872 if (head - tail > data->watermark) 3138 handle->page = offset >> (PAGE_SHIFT + page_order(data));
2873 atomic_set(&data->wakeup, 1); 3139 handle->page &= data->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3141 handle->addr = data->data_pages[handle->page];
3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
2874 3144
2875 if (have_lost) { 3145 if (have_lost) {
2876 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
2877 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
2878 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
2879 lost_event.id = event->id; 3149 lost_event.id = event->id;
2880 lost_event.lost = atomic_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&data->lost, 0);
2881 3151
2882 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
2883 } 3153 }
@@ -2885,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle,
2885 return 0; 3155 return 0;
2886 3156
2887fail: 3157fail:
2888 atomic_inc(&data->lost); 3158 local_inc(&data->lost);
2889 perf_output_unlock(handle); 3159 perf_output_put_handle(handle);
2890out: 3160out:
2891 rcu_read_unlock(); 3161 rcu_read_unlock();
2892 3162
@@ -2901,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle)
2901 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
2902 3172
2903 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
2904 int events = atomic_inc_return(&data->events); 3174 int events = local_inc_return(&data->events);
2905 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
2906 atomic_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &data->events);
2907 atomic_set(&data->wakeup, 1); 3177 local_inc(&data->wakeup);
2908 } 3178 }
2909 } 3179 }
2910 3180
2911 perf_output_unlock(handle); 3181 perf_output_put_handle(handle);
2912 rcu_read_unlock(); 3182 rcu_read_unlock();
2913} 3183}
2914 3184
@@ -3243,9 +3513,8 @@ static void perf_event_task_output(struct perf_event *event,
3243 struct perf_task_event *task_event) 3513 struct perf_task_event *task_event)
3244{ 3514{
3245 struct perf_output_handle handle; 3515 struct perf_output_handle handle;
3246 int size;
3247 struct task_struct *task = task_event->task; 3516 struct task_struct *task = task_event->task;
3248 int ret; 3517 int size, ret;
3249 3518
3250 size = task_event->event_id.header.size; 3519 size = task_event->event_id.header.size;
3251 ret = perf_output_begin(&handle, event, size, 0, 0); 3520 ret = perf_output_begin(&handle, event, size, 0, 0);
@@ -3259,8 +3528,6 @@ static void perf_event_task_output(struct perf_event *event,
3259 task_event->event_id.tid = perf_event_tid(event, task); 3528 task_event->event_id.tid = perf_event_tid(event, task);
3260 task_event->event_id.ptid = perf_event_tid(event, current); 3529 task_event->event_id.ptid = perf_event_tid(event, current);
3261 3530
3262 task_event->event_id.time = perf_clock();
3263
3264 perf_output_put(&handle, task_event->event_id); 3531 perf_output_put(&handle, task_event->event_id);
3265 3532
3266 perf_output_end(&handle); 3533 perf_output_end(&handle);
@@ -3268,7 +3535,7 @@ static void perf_event_task_output(struct perf_event *event,
3268 3535
3269static int perf_event_task_match(struct perf_event *event) 3536static int perf_event_task_match(struct perf_event *event)
3270{ 3537{
3271 if (event->state != PERF_EVENT_STATE_ACTIVE) 3538 if (event->state < PERF_EVENT_STATE_INACTIVE)
3272 return 0; 3539 return 0;
3273 3540
3274 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3541 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3300,7 +3567,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3300 cpuctx = &get_cpu_var(perf_cpu_context); 3567 cpuctx = &get_cpu_var(perf_cpu_context);
3301 perf_event_task_ctx(&cpuctx->ctx, task_event); 3568 perf_event_task_ctx(&cpuctx->ctx, task_event);
3302 if (!ctx) 3569 if (!ctx)
3303 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3570 ctx = rcu_dereference(current->perf_event_ctxp);
3304 if (ctx) 3571 if (ctx)
3305 perf_event_task_ctx(ctx, task_event); 3572 perf_event_task_ctx(ctx, task_event);
3306 put_cpu_var(perf_cpu_context); 3573 put_cpu_var(perf_cpu_context);
@@ -3331,6 +3598,7 @@ static void perf_event_task(struct task_struct *task,
3331 /* .ppid */ 3598 /* .ppid */
3332 /* .tid */ 3599 /* .tid */
3333 /* .ptid */ 3600 /* .ptid */
3601 .time = perf_clock(),
3334 }, 3602 },
3335 }; 3603 };
3336 3604
@@ -3380,7 +3648,7 @@ static void perf_event_comm_output(struct perf_event *event,
3380 3648
3381static int perf_event_comm_match(struct perf_event *event) 3649static int perf_event_comm_match(struct perf_event *event)
3382{ 3650{
3383 if (event->state != PERF_EVENT_STATE_ACTIVE) 3651 if (event->state < PERF_EVENT_STATE_INACTIVE)
3384 return 0; 3652 return 0;
3385 3653
3386 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3654 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3500,7 +3768,7 @@ static void perf_event_mmap_output(struct perf_event *event,
3500static int perf_event_mmap_match(struct perf_event *event, 3768static int perf_event_mmap_match(struct perf_event *event,
3501 struct perf_mmap_event *mmap_event) 3769 struct perf_mmap_event *mmap_event)
3502{ 3770{
3503 if (event->state != PERF_EVENT_STATE_ACTIVE) 3771 if (event->state < PERF_EVENT_STATE_INACTIVE)
3504 return 0; 3772 return 0;
3505 3773
3506 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3774 if (event->cpu != -1 && event->cpu != smp_processor_id())
@@ -3602,14 +3870,14 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3602 .event_id = { 3870 .event_id = {
3603 .header = { 3871 .header = {
3604 .type = PERF_RECORD_MMAP, 3872 .type = PERF_RECORD_MMAP,
3605 .misc = 0, 3873 .misc = PERF_RECORD_MISC_USER,
3606 /* .size */ 3874 /* .size */
3607 }, 3875 },
3608 /* .pid */ 3876 /* .pid */
3609 /* .tid */ 3877 /* .tid */
3610 .start = vma->vm_start, 3878 .start = vma->vm_start,
3611 .len = vma->vm_end - vma->vm_start, 3879 .len = vma->vm_end - vma->vm_start,
3612 .pgoff = vma->vm_pgoff, 3880 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3613 }, 3881 },
3614 }; 3882 };
3615 3883
@@ -3689,12 +3957,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3689 3957
3690 if (event->attr.freq) { 3958 if (event->attr.freq) {
3691 u64 now = perf_clock(); 3959 u64 now = perf_clock();
3692 s64 delta = now - hwc->freq_stamp; 3960 s64 delta = now - hwc->freq_time_stamp;
3693 3961
3694 hwc->freq_stamp = now; 3962 hwc->freq_time_stamp = now;
3695 3963
3696 if (delta > 0 && delta < TICK_NSEC) 3964 if (delta > 0 && delta < 2*TICK_NSEC)
3697 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3965 perf_adjust_period(event, delta, hwc->last_period);
3698 } 3966 }
3699 3967
3700 /* 3968 /*
@@ -3790,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3790 } 4058 }
3791} 4059}
3792 4060
3793static void perf_swevent_unthrottle(struct perf_event *event)
3794{
3795 /*
3796 * Nothing to do, we already reset hwc->interrupts.
3797 */
3798}
3799
3800static void perf_swevent_add(struct perf_event *event, u64 nr, 4061static void perf_swevent_add(struct perf_event *event, u64 nr,
3801 int nmi, struct perf_sample_data *data, 4062 int nmi, struct perf_sample_data *data,
3802 struct pt_regs *regs) 4063 struct pt_regs *regs)
@@ -3820,39 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3820 perf_swevent_overflow(event, 0, nmi, data, regs); 4081 perf_swevent_overflow(event, 0, nmi, data, regs);
3821} 4082}
3822 4083
3823static int perf_swevent_is_counting(struct perf_event *event)
3824{
3825 /*
3826 * The event is active, we're good!
3827 */
3828 if (event->state == PERF_EVENT_STATE_ACTIVE)
3829 return 1;
3830
3831 /*
3832 * The event is off/error, not counting.
3833 */
3834 if (event->state != PERF_EVENT_STATE_INACTIVE)
3835 return 0;
3836
3837 /*
3838 * The event is inactive, if the context is active
3839 * we're part of a group that didn't make it on the 'pmu',
3840 * not counting.
3841 */
3842 if (event->ctx->is_active)
3843 return 0;
3844
3845 /*
3846 * We're inactive and the context is too, this means the
3847 * task is scheduled out, we're counting events that happen
3848 * to us, like migration events.
3849 */
3850 return 1;
3851}
3852
3853static int perf_tp_event_match(struct perf_event *event,
3854 struct perf_sample_data *data);
3855
3856static int perf_exclude_event(struct perf_event *event, 4084static int perf_exclude_event(struct perf_event *event,
3857 struct pt_regs *regs) 4085 struct pt_regs *regs)
3858{ 4086{
@@ -3873,12 +4101,6 @@ static int perf_swevent_match(struct perf_event *event,
3873 struct perf_sample_data *data, 4101 struct perf_sample_data *data,
3874 struct pt_regs *regs) 4102 struct pt_regs *regs)
3875{ 4103{
3876 if (event->cpu != -1 && event->cpu != smp_processor_id())
3877 return 0;
3878
3879 if (!perf_swevent_is_counting(event))
3880 return 0;
3881
3882 if (event->attr.type != type) 4104 if (event->attr.type != type)
3883 return 0; 4105 return 0;
3884 4106
@@ -3888,30 +4110,88 @@ static int perf_swevent_match(struct perf_event *event,
3888 if (perf_exclude_event(event, regs)) 4110 if (perf_exclude_event(event, regs))
3889 return 0; 4111 return 0;
3890 4112
3891 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3892 !perf_tp_event_match(event, data))
3893 return 0;
3894
3895 return 1; 4113 return 1;
3896} 4114}
3897 4115
3898static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4116static inline u64 swevent_hash(u64 type, u32 event_id)
3899 enum perf_type_id type, 4117{
3900 u32 event_id, u64 nr, int nmi, 4118 u64 val = event_id | (type << 32);
3901 struct perf_sample_data *data, 4119
3902 struct pt_regs *regs) 4120 return hash_64(val, SWEVENT_HLIST_BITS);
4121}
4122
4123static inline struct hlist_head *
4124__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
3903{ 4125{
4126 u64 hash = swevent_hash(type, event_id);
4127
4128 return &hlist->heads[hash];
4129}
4130
4131/* For the read side: events when they trigger */
4132static inline struct hlist_head *
4133find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4134{
4135 struct swevent_hlist *hlist;
4136
4137 hlist = rcu_dereference(ctx->swevent_hlist);
4138 if (!hlist)
4139 return NULL;
4140
4141 return __find_swevent_head(hlist, type, event_id);
4142}
4143
4144/* For the event head insertion and removal in the hlist */
4145static inline struct hlist_head *
4146find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4147{
4148 struct swevent_hlist *hlist;
4149 u32 event_id = event->attr.config;
4150 u64 type = event->attr.type;
4151
4152 /*
4153 * Event scheduling is always serialized against hlist allocation
4154 * and release. Which makes the protected version suitable here.
4155 * The context lock guarantees that.
4156 */
4157 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4158 lockdep_is_held(&event->ctx->lock));
4159 if (!hlist)
4160 return NULL;
4161
4162 return __find_swevent_head(hlist, type, event_id);
4163}
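
Software events are now dispatched through a per-cpu hash table of list heads keyed by (type, config), so a triggering event only walks its bucket instead of every event in the context. A small sketch of the bucketing; the multiplier here is a generic golden-ratio constant, not necessarily the one behind the kernel's hash_64():

#include <stdio.h>
#include <stdint.h>

#define SWEVENT_HLIST_BITS	8	/* table size used by the patch */

static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);	/* same key as swevent_hash() */

	/* multiplicative hash, keep the top SWEVENT_HLIST_BITS bits */
	return (unsigned int)((val * 0x9E3779B97F4A7C15ULL) >> (64 - SWEVENT_HLIST_BITS));
}

int main(void)
{
	/* PERF_TYPE_SOFTWARE = 1, PERF_COUNT_SW_PAGE_FAULTS = 2 */
	printf("bucket = %u\n", swevent_bucket(1, 2));
	return 0;
}
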
4164
4165static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4166 u64 nr, int nmi,
4167 struct perf_sample_data *data,
4168 struct pt_regs *regs)
4169{
4170 struct perf_cpu_context *cpuctx;
3904 struct perf_event *event; 4171 struct perf_event *event;
4172 struct hlist_node *node;
4173 struct hlist_head *head;
3905 4174
3906 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4175 cpuctx = &__get_cpu_var(perf_cpu_context);
4176
4177 rcu_read_lock();
4178
4179 head = find_swevent_head_rcu(cpuctx, type, event_id);
4180
4181 if (!head)
4182 goto end;
4183
4184 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
3907 if (perf_swevent_match(event, type, event_id, data, regs)) 4185 if (perf_swevent_match(event, type, event_id, data, regs))
3908 perf_swevent_add(event, nr, nmi, data, regs); 4186 perf_swevent_add(event, nr, nmi, data, regs);
3909 } 4187 }
4188end:
4189 rcu_read_unlock();
3910} 4190}
3911 4191
3912int perf_swevent_get_recursion_context(void) 4192int perf_swevent_get_recursion_context(void)
3913{ 4193{
3914 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3915 int rctx; 4195 int rctx;
3916 4196
3917 if (in_nmi()) 4197 if (in_nmi())
@@ -3923,10 +4203,8 @@ int perf_swevent_get_recursion_context(void)
3923 else 4203 else
3924 rctx = 0; 4204 rctx = 0;
3925 4205
3926 if (cpuctx->recursion[rctx]) { 4206 if (cpuctx->recursion[rctx])
3927 put_cpu_var(perf_cpu_context);
3928 return -1; 4207 return -1;
3929 }
3930 4208
3931 cpuctx->recursion[rctx]++; 4209 cpuctx->recursion[rctx]++;
3932 barrier(); 4210 barrier();
@@ -3940,31 +4218,9 @@ void perf_swevent_put_recursion_context(int rctx)
3940 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3941 barrier(); 4219 barrier();
3942 cpuctx->recursion[rctx]--; 4220 cpuctx->recursion[rctx]--;
3943 put_cpu_var(perf_cpu_context);
3944} 4221}
3945EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3946 4223
3947static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3948 u64 nr, int nmi,
3949 struct perf_sample_data *data,
3950 struct pt_regs *regs)
3951{
3952 struct perf_cpu_context *cpuctx;
3953 struct perf_event_context *ctx;
3954
3955 cpuctx = &__get_cpu_var(perf_cpu_context);
3956 rcu_read_lock();
3957 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3958 nr, nmi, data, regs);
3959 /*
3960 * doesn't really matter which of the child contexts the
3961 * events ends up in.
3962 */
3963 ctx = rcu_dereference(current->perf_event_ctxp);
3964 if (ctx)
3965 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3966 rcu_read_unlock();
3967}
3968 4224
3969void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4225void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3970 struct pt_regs *regs, u64 addr) 4226 struct pt_regs *regs, u64 addr)
@@ -3972,16 +4228,17 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3972 struct perf_sample_data data; 4228 struct perf_sample_data data;
3973 int rctx; 4229 int rctx;
3974 4230
4231 preempt_disable_notrace();
3975 rctx = perf_swevent_get_recursion_context(); 4232 rctx = perf_swevent_get_recursion_context();
3976 if (rctx < 0) 4233 if (rctx < 0)
3977 return; 4234 return;
3978 4235
3979 data.addr = addr; 4236 perf_sample_data_init(&data, addr);
3980 data.raw = NULL;
3981 4237
3982 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4238 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3983 4239
3984 perf_swevent_put_recursion_context(rctx); 4240 perf_swevent_put_recursion_context(rctx);
4241 preempt_enable_notrace();
3985} 4242}
3986 4243
3987static void perf_swevent_read(struct perf_event *event) 4244static void perf_swevent_read(struct perf_event *event)
@@ -3991,23 +4248,46 @@ static void perf_swevent_read(struct perf_event *event)
3991static int perf_swevent_enable(struct perf_event *event) 4248static int perf_swevent_enable(struct perf_event *event)
3992{ 4249{
3993 struct hw_perf_event *hwc = &event->hw; 4250 struct hw_perf_event *hwc = &event->hw;
4251 struct perf_cpu_context *cpuctx;
4252 struct hlist_head *head;
4253
4254 cpuctx = &__get_cpu_var(perf_cpu_context);
3994 4255
3995 if (hwc->sample_period) { 4256 if (hwc->sample_period) {
3996 hwc->last_period = hwc->sample_period; 4257 hwc->last_period = hwc->sample_period;
3997 perf_swevent_set_period(event); 4258 perf_swevent_set_period(event);
3998 } 4259 }
4260
4261 head = find_swevent_head(cpuctx, event);
4262 if (WARN_ON_ONCE(!head))
4263 return -EINVAL;
4264
4265 hlist_add_head_rcu(&event->hlist_entry, head);
4266
3999 return 0; 4267 return 0;
4000} 4268}
4001 4269
4002static void perf_swevent_disable(struct perf_event *event) 4270static void perf_swevent_disable(struct perf_event *event)
4003{ 4271{
4272 hlist_del_rcu(&event->hlist_entry);
4273}
4274
4275static void perf_swevent_void(struct perf_event *event)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4004} 4282}
4005 4283
4006static const struct pmu perf_ops_generic = { 4284static const struct pmu perf_ops_generic = {
4007 .enable = perf_swevent_enable, 4285 .enable = perf_swevent_enable,
4008 .disable = perf_swevent_disable, 4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4009 .read = perf_swevent_read, 4289 .read = perf_swevent_read,
4010 .unthrottle = perf_swevent_unthrottle, 4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4011}; 4291};
4012 4292
4013/* 4293/*
@@ -4022,22 +4302,14 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4022 struct perf_event *event; 4302 struct perf_event *event;
4023 u64 period; 4303 u64 period;
4024 4304
4025 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4305 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4026 event->pmu->read(event); 4306 event->pmu->read(event);
4027 4307
4028 data.addr = 0; 4308 perf_sample_data_init(&data, 0);
4029 data.raw = NULL;
4030 data.period = event->hw.last_period; 4309 data.period = event->hw.last_period;
4031 regs = get_irq_regs(); 4310 regs = get_irq_regs();
4032 /*
4033 * In case we exclude kernel IPs or are somehow not in interrupt
4034 * context, provide the next best thing, the user IP.
4035 */
4036 if ((event->attr.exclude_kernel || !regs) &&
4037 !event->attr.exclude_user)
4038 regs = task_pt_regs(current);
4039 4311
4040 if (regs) { 4312 if (regs && !perf_exclude_event(event, regs)) {
4041 if (!(event->attr.exclude_idle && current->pid == 0)) 4313 if (!(event->attr.exclude_idle && current->pid == 0))
4042 if (perf_event_overflow(event, 0, &data, regs)) 4314 if (perf_event_overflow(event, 0, &data, regs))
4043 ret = HRTIMER_NORESTART; 4315 ret = HRTIMER_NORESTART;
@@ -4185,33 +4457,124 @@ static const struct pmu perf_ops_task_clock = {
4185 .read = task_clock_perf_event_read, 4457 .read = task_clock_perf_event_read,
4186}; 4458};
4187 4459
4188#ifdef CONFIG_EVENT_PROFILE 4460/* Deref the hlist from the update side */
4461static inline struct swevent_hlist *
4462swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4463{
4464 return rcu_dereference_protected(cpuctx->swevent_hlist,
4465 lockdep_is_held(&cpuctx->hlist_mutex));
4466}
4189 4467
4190void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4468static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4191 int entry_size)
4192{ 4469{
4193 struct perf_raw_record raw = { 4470 struct swevent_hlist *hlist;
4194 .size = entry_size,
4195 .data = record,
4196 };
4197 4471
4198 struct perf_sample_data data = { 4472 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4199 .addr = addr, 4473 kfree(hlist);
4200 .raw = &raw, 4474}
4201 };
4202 4475
4203 struct pt_regs *regs = get_irq_regs(); 4476static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4477{
4478 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4204 4479
4205 if (!regs) 4480 if (!hlist)
4206 regs = task_pt_regs(current); 4481 return;
4207 4482
4208 /* Trace events already protected against recursion */ 4483 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4209 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4484 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4210 &data, regs);
4211} 4485}
4212EXPORT_SYMBOL_GPL(perf_tp_event);
4213 4486
4214static int perf_tp_event_match(struct perf_event *event, 4487static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4488{
4489 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4490
4491 mutex_lock(&cpuctx->hlist_mutex);
4492
4493 if (!--cpuctx->hlist_refcount)
4494 swevent_hlist_release(cpuctx);
4495
4496 mutex_unlock(&cpuctx->hlist_mutex);
4497}
4498
4499static void swevent_hlist_put(struct perf_event *event)
4500{
4501 int cpu;
4502
4503 if (event->cpu != -1) {
4504 swevent_hlist_put_cpu(event, event->cpu);
4505 return;
4506 }
4507
4508 for_each_possible_cpu(cpu)
4509 swevent_hlist_put_cpu(event, cpu);
4510}
4511
4512static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4513{
4514 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4515 int err = 0;
4516
4517 mutex_lock(&cpuctx->hlist_mutex);
4518
4519 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4520 struct swevent_hlist *hlist;
4521
4522 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4523 if (!hlist) {
4524 err = -ENOMEM;
4525 goto exit;
4526 }
4527 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4528 }
4529 cpuctx->hlist_refcount++;
4530 exit:
4531 mutex_unlock(&cpuctx->hlist_mutex);
4532
4533 return err;
4534}
4535
4536static int swevent_hlist_get(struct perf_event *event)
4537{
4538 int err;
4539 int cpu, failed_cpu;
4540
4541 if (event->cpu != -1)
4542 return swevent_hlist_get_cpu(event, event->cpu);
4543
4544 get_online_cpus();
4545 for_each_possible_cpu(cpu) {
4546 err = swevent_hlist_get_cpu(event, cpu);
4547 if (err) {
4548 failed_cpu = cpu;
4549 goto fail;
4550 }
4551 }
4552 put_online_cpus();
4553
4554 return 0;
4555 fail:
4556 for_each_possible_cpu(cpu) {
4557 if (cpu == failed_cpu)
4558 break;
4559 swevent_hlist_put_cpu(event, cpu);
4560 }
4561
4562 put_online_cpus();
4563 return err;
4564}
4565
4566#ifdef CONFIG_EVENT_TRACING
4567
4568static const struct pmu perf_ops_tracepoint = {
4569 .enable = perf_trace_enable,
4570 .disable = perf_trace_disable,
4571 .start = perf_swevent_int,
4572 .stop = perf_swevent_void,
4573 .read = perf_swevent_read,
4574 .unthrottle = perf_swevent_void,
4575};
4576
4577static int perf_tp_filter_match(struct perf_event *event,
4215 struct perf_sample_data *data) 4578 struct perf_sample_data *data)
4216{ 4579{
4217 void *record = data->raw->data; 4580 void *record = data->raw->data;
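
The swevent_hlist helpers earlier in this hunk follow a common RCU update-side idiom: the writer, who holds hlist_mutex, dereferences the pointer with rcu_dereference_protected() plus lockdep_is_held() (so lockdep can check the claim), publishes a replacement with rcu_assign_pointer(), and frees the old object only after a grace period via call_rcu(). A minimal, generic sketch of the same pattern follows; struct foo and the foo_* names are made up for illustration, only the RCU, lockdep and allocator primitives are real.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int		value;
	struct rcu_head	rcu_head;
};

static struct foo *global_foo;		/* written under foo_mutex, read under RCU */
static DEFINE_MUTEX(foo_mutex);

/* Update-side dereference: legal because foo_mutex is held. */
static struct foo *foo_deref(void)
{
	return rcu_dereference_protected(global_foo,
					 lockdep_is_held(&foo_mutex));
}

static void foo_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct foo, rcu_head));
}

/* Publish a new object and free the old one after a grace period. */
static int foo_replace(int value)
{
	struct foo *new, *old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->value = value;

	mutex_lock(&foo_mutex);
	old = foo_deref();
	rcu_assign_pointer(global_foo, new);
	mutex_unlock(&foo_mutex);

	if (old)
		call_rcu(&old->rcu_head, foo_free_rcu);
	return 0;
}

/* Readers take no lock; they use the plain RCU read-side primitives. */
static int foo_read_value(void)
{
	struct foo *f;
	int val = -1;

	rcu_read_lock();
	f = rcu_dereference(global_foo);
	if (f)
		val = f->value;
	rcu_read_unlock();
	return val;
}
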
@@ -4221,13 +4584,55 @@ static int perf_tp_event_match(struct perf_event *event,
4221 return 0; 4584 return 0;
4222} 4585}
4223 4586
4587static int perf_tp_event_match(struct perf_event *event,
4588 struct perf_sample_data *data,
4589 struct pt_regs *regs)
4590{
4591 /*
4592 * All tracepoints are from kernel-space.
4593 */
4594 if (event->attr.exclude_kernel)
4595 return 0;
4596
4597 if (!perf_tp_filter_match(event, data))
4598 return 0;
4599
4600 return 1;
4601}
4602
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head)
4605{
4606 struct perf_sample_data data;
4607 struct perf_event *event;
4608 struct hlist_node *node;
4609
4610 struct perf_raw_record raw = {
4611 .size = entry_size,
4612 .data = record,
4613 };
4614
4615 perf_sample_data_init(&data, addr);
4616 data.raw = &raw;
4617
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs);
4622 }
4623 rcu_read_unlock();
4624}
4625EXPORT_SYMBOL_GPL(perf_tp_event);
4626
4224static void tp_perf_event_destroy(struct perf_event *event) 4627static void tp_perf_event_destroy(struct perf_event *event)
4225{ 4628{
4226 ftrace_profile_disable(event->attr.config); 4629 perf_trace_destroy(event);
4227} 4630}
4228 4631
4229static const struct pmu *tp_perf_event_init(struct perf_event *event) 4632static const struct pmu *tp_perf_event_init(struct perf_event *event)
4230{ 4633{
4634 int err;
4635
4231 /* 4636 /*
4232 * Raw tracepoint data is a severe data leak, only allow root to 4637 * Raw tracepoint data is a severe data leak, only allow root to
4233 * have these. 4638 * have these.
@@ -4237,12 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4237 !capable(CAP_SYS_ADMIN)) 4642 !capable(CAP_SYS_ADMIN))
4238 return ERR_PTR(-EPERM); 4643 return ERR_PTR(-EPERM);
4239 4644
4240 if (ftrace_profile_enable(event->attr.config)) 4645 err = perf_trace_init(event);
4646 if (err)
4241 return NULL; 4647 return NULL;
4242 4648
4243 event->destroy = tp_perf_event_destroy; 4649 event->destroy = tp_perf_event_destroy;
4244 4650
4245 return &perf_ops_generic; 4651 return &perf_ops_tracepoint;
4246} 4652}
4247 4653
4248static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4654static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4270,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event)
4270 4676
4271#else 4677#else
4272 4678
4273static int perf_tp_event_match(struct perf_event *event,
4274 struct perf_sample_data *data)
4275{
4276 return 1;
4277}
4278
4279static const struct pmu *tp_perf_event_init(struct perf_event *event) 4679static const struct pmu *tp_perf_event_init(struct perf_event *event)
4280{ 4680{
4281 return NULL; 4681 return NULL;
@@ -4290,7 +4690,7 @@ static void perf_event_free_filter(struct perf_event *event)
4290{ 4690{
4291} 4691}
4292 4692
4293#endif /* CONFIG_EVENT_PROFILE */ 4693#endif /* CONFIG_EVENT_TRACING */
4294 4694
4295#ifdef CONFIG_HAVE_HW_BREAKPOINT 4695#ifdef CONFIG_HAVE_HW_BREAKPOINT
4296static void bp_perf_event_destroy(struct perf_event *event) 4696static void bp_perf_event_destroy(struct perf_event *event)
@@ -4316,8 +4716,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4316 struct perf_sample_data sample; 4716 struct perf_sample_data sample;
4317 struct pt_regs *regs = data; 4717 struct pt_regs *regs = data;
4318 4718
4319 sample.raw = NULL; 4719 perf_sample_data_init(&sample, bp->attr.bp_addr);
4320 sample.addr = bp->attr.bp_addr;
4321 4720
4322 if (!perf_exclude_event(bp, regs)) 4721 if (!perf_exclude_event(bp, regs))
4323 perf_swevent_add(bp, 1, 1, &sample, regs); 4722 perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4342,6 +4741,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4342 WARN_ON(event->parent); 4741 WARN_ON(event->parent);
4343 4742
4344 atomic_dec(&perf_swevent_enabled[event_id]); 4743 atomic_dec(&perf_swevent_enabled[event_id]);
4744 swevent_hlist_put(event);
4345} 4745}
4346 4746
4347static const struct pmu *sw_perf_event_init(struct perf_event *event) 4747static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4380,6 +4780,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4380 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4780 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4381 case PERF_COUNT_SW_EMULATION_FAULTS: 4781 case PERF_COUNT_SW_EMULATION_FAULTS:
4382 if (!event->parent) { 4782 if (!event->parent) {
4783 int err;
4784
4785 err = swevent_hlist_get(event);
4786 if (err)
4787 return ERR_PTR(err);
4788
4383 atomic_inc(&perf_swevent_enabled[event_id]); 4789 atomic_inc(&perf_swevent_enabled[event_id]);
4384 event->destroy = sw_perf_event_destroy; 4790 event->destroy = sw_perf_event_destroy;
4385 } 4791 }
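
Software events such as these are ordinary perf events from userspace's point of view; the hlist reference taken in sw_perf_event_init() is invisible to callers. For orientation, a minimal userspace sketch (error handling trimmed; the perf_event_open() wrapper is local because glibc provides none) that opens and reads one software counter:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;
	attr.disabled = 1;

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement would run here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("page faults: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}
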
@@ -4580,7 +4986,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4580 if (attr->type >= PERF_TYPE_MAX) 4986 if (attr->type >= PERF_TYPE_MAX)
4581 return -EINVAL; 4987 return -EINVAL;
4582 4988
4583 if (attr->__reserved_1 || attr->__reserved_2) 4989 if (attr->__reserved_1)
4584 return -EINVAL; 4990 return -EINVAL;
4585 4991
4586 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4992 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4598,54 +5004,53 @@ err_size:
4598 goto out; 5004 goto out;
4599} 5005}
4600 5006
4601static int perf_event_set_output(struct perf_event *event, int output_fd) 5007static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
4602{ 5009{
4603 struct perf_event *output_event = NULL; 5010 struct perf_mmap_data *data = NULL, *old_data = NULL;
4604 struct file *output_file = NULL;
4605 struct perf_event *old_output;
4606 int fput_needed = 0;
4607 int ret = -EINVAL; 5011 int ret = -EINVAL;
4608 5012
4609 if (!output_fd) 5013 if (!output_event)
4610 goto set; 5014 goto set;
4611 5015
4612 output_file = fget_light(output_fd, &fput_needed); 5016 /* don't allow circular references */
4613 if (!output_file) 5017 if (event == output_event)
4614 return -EBADF;
4615
4616 if (output_file->f_op != &perf_fops)
4617 goto out; 5018 goto out;
4618 5019
4619 output_event = output_file->private_data; 5020 /*
4620 5021 * Don't allow cross-cpu buffers
4621 /* Don't chain output fds */ 5022 */
4622 if (output_event->output) 5023 if (output_event->cpu != event->cpu)
4623 goto out; 5024 goto out;
4624 5025
4625 /* Don't set an output fd when we already have an output channel */ 5026 /*
4626 if (event->data) 5027 * If its not a per-cpu buffer, it must be the same task.
5028 */
5029 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4627 goto out; 5030 goto out;
4628 5031
4629 atomic_long_inc(&output_file->f_count);
4630
4631set: 5032set:
4632 mutex_lock(&event->mmap_mutex); 5033 mutex_lock(&event->mmap_mutex);
4633 old_output = event->output; 5034 /* Can't redirect output if we've got an active mmap() */
4634 rcu_assign_pointer(event->output, output_event); 5035 if (atomic_read(&event->mmap_count))
4635 mutex_unlock(&event->mmap_mutex); 5036 goto unlock;
4636 5037
4637 if (old_output) { 5038 if (output_event) {
4638 /* 5039 /* get the buffer we want to redirect to */
4639 * we need to make sure no existing perf_output_*() 5040 data = perf_mmap_data_get(output_event);
4640 * is still referencing this event. 5041 if (!data)
4641 */ 5042 goto unlock;
4642 synchronize_rcu();
4643 fput(old_output->filp);
4644 } 5043 }
4645 5044
5045 old_data = event->data;
5046 rcu_assign_pointer(event->data, data);
4646 ret = 0; 5047 ret = 0;
5048unlock:
5049 mutex_unlock(&event->mmap_mutex);
5050
5051 if (old_data)
5052 perf_mmap_data_put(old_data);
4647out: 5053out:
4648 fput_light(output_file, fput_needed);
4649 return ret; 5054 return ret;
4650} 5055}
4651 5056
@@ -4661,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open,
4661 struct perf_event_attr __user *, attr_uptr, 5066 struct perf_event_attr __user *, attr_uptr,
4662 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4663{ 5068{
4664 struct perf_event *event, *group_leader; 5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL;
4665 struct perf_event_attr attr; 5070 struct perf_event_attr attr;
4666 struct perf_event_context *ctx; 5071 struct perf_event_context *ctx;
4667 struct file *event_file = NULL; 5072 struct file *event_file = NULL;
4668 struct file *group_file = NULL; 5073 struct file *group_file = NULL;
5074 int event_fd;
4669 int fput_needed = 0; 5075 int fput_needed = 0;
4670 int fput_needed2 = 0;
4671 int err; 5076 int err;
4672 5077
4673 /* for future expandability... */ 5078 /* for future expandability... */
@@ -4688,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open,
4688 return -EINVAL; 5093 return -EINVAL;
4689 } 5094 }
4690 5095
5096 event_fd = get_unused_fd_flags(O_RDWR);
5097 if (event_fd < 0)
5098 return event_fd;
5099
4691 /* 5100 /*
4692 * Get the target context (task or percpu): 5101 * Get the target context (task or percpu):
4693 */ 5102 */
4694 ctx = find_get_context(pid, cpu); 5103 ctx = find_get_context(pid, cpu);
4695 if (IS_ERR(ctx)) 5104 if (IS_ERR(ctx)) {
4696 return PTR_ERR(ctx); 5105 err = PTR_ERR(ctx);
5106 goto err_fd;
5107 }
5108
5109 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader);
5113 goto err_put_context;
5114 }
5115 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT)
5117 output_event = group_leader;
5118 if (flags & PERF_FLAG_FD_NO_GROUP)
5119 group_leader = NULL;
5120 }
4697 5121
4698 /* 5122 /*
4699 * Look up the group leader (we will attach this event to it): 5123 * Look up the group leader (we will attach this event to it):
4700 */ 5124 */
4701 group_leader = NULL; 5125 if (group_leader) {
4702 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4703 err = -EINVAL; 5126 err = -EINVAL;
4704 group_file = fget_light(group_fd, &fput_needed);
4705 if (!group_file)
4706 goto err_put_context;
4707 if (group_file->f_op != &perf_fops)
4708 goto err_put_context;
4709 5127
4710 group_leader = group_file->private_data;
4711 /* 5128 /*
4712 * Do not allow a recursive hierarchy (this new sibling 5129 * Do not allow a recursive hierarchy (this new sibling
4713 * becoming part of another group-sibling): 5130 * becoming part of another group-sibling):
@@ -4729,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open,
4729 5146
4730 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4731 NULL, NULL, GFP_KERNEL); 5148 NULL, NULL, GFP_KERNEL);
4732 err = PTR_ERR(event); 5149 if (IS_ERR(event)) {
4733 if (IS_ERR(event)) 5150 err = PTR_ERR(event);
4734 goto err_put_context; 5151 goto err_put_context;
5152 }
4735 5153
4736 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); 5154 if (output_event) {
4737 if (err < 0) 5155 err = perf_event_set_output(event, output_event);
4738 goto err_free_put_context; 5156 if (err)
5157 goto err_free_put_context;
5158 }
4739 5159
4740 event_file = fget_light(err, &fput_needed2); 5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
4741 if (!event_file) 5161 if (IS_ERR(event_file)) {
5162 err = PTR_ERR(event_file);
4742 goto err_free_put_context; 5163 goto err_free_put_context;
4743
4744 if (flags & PERF_FLAG_FD_OUTPUT) {
4745 err = perf_event_set_output(event, group_fd);
4746 if (err)
4747 goto err_fput_free_put_context;
4748 } 5164 }
4749 5165
4750 event->filp = event_file; 5166 event->filp = event_file;
@@ -4760,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open,
4760 list_add_tail(&event->owner_entry, &current->perf_event_list); 5176 list_add_tail(&event->owner_entry, &current->perf_event_list);
4761 mutex_unlock(&current->perf_event_mutex); 5177 mutex_unlock(&current->perf_event_mutex);
4762 5178
4763err_fput_free_put_context: 5179 /*
4764 fput_light(event_file, fput_needed2); 5180 * Drop the reference on the group_event after placing the
5181 * new event on the sibling_list. This ensures destruction
5182 * of the group leader will find the pointer to itself in
5183 * perf_group_detach().
5184 */
5185 fput_light(group_file, fput_needed);
5186 fd_install(event_fd, event_file);
5187 return event_fd;
4765 5188
4766err_free_put_context: 5189err_free_put_context:
4767 if (err < 0) 5190 free_event(event);
4768 kfree(event);
4769
4770err_put_context: 5191err_put_context:
4771 if (err < 0)
4772 put_ctx(ctx);
4773
4774 fput_light(group_file, fput_needed); 5192 fput_light(group_file, fput_needed);
4775 5193 put_ctx(ctx);
5194err_fd:
5195 put_unused_fd(event_fd);
4776 return err; 5196 return err;
4777} 5197}
4778 5198
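
Putting the new perf_event_set_output() rules together from userspace: the redirect target must already have a buffer (perf_mmap_data_get() fails otherwise), the two events must share a CPU or a task context, and the redirected event itself must not be mmap()ed. A hedged sketch, with error handling trimmed and the sample settings chosen arbitrarily:

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	size_t len = (1 + 8) * getpagesize();	/* header page + 8 data pages */
	int leader, follower;
	void *base;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.sample_period = 100000;		/* arbitrary */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	leader = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (leader < 0) {
		perror("perf_event_open(leader)");
		return 1;
	}

	/* The output event needs a buffer before anyone can redirect into it. */
	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, leader, 0);
	if (base == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Same task, same cpu (-1): this event's samples land in the
	 * leader's ring buffer, so no second mmap() is needed. */
	attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
	attr.sample_period = 1;
	follower = perf_event_open(&attr, 0, -1, leader, PERF_FLAG_FD_OUTPUT);
	if (follower < 0) {
		perror("perf_event_open(follower)");
		return 1;
	}

	/* ... parse struct perf_event_header records out of 'base' ... */

	close(follower);
	munmap(base, len);
	close(leader);
	return 0;
}
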
@@ -4871,8 +5291,15 @@ inherit_event(struct perf_event *parent_event,
4871 else 5291 else
4872 child_event->state = PERF_EVENT_STATE_OFF; 5292 child_event->state = PERF_EVENT_STATE_OFF;
4873 5293
4874 if (parent_event->attr.freq) 5294 if (parent_event->attr.freq) {
4875 child_event->hw.sample_period = parent_event->hw.sample_period; 5295 u64 sample_period = parent_event->hw.sample_period;
5296 struct hw_perf_event *hwc = &child_event->hw;
5297
5298 hwc->sample_period = sample_period;
5299 hwc->last_period = sample_period;
5300
5301 atomic64_set(&hwc->period_left, sample_period);
5302 }
4876 5303
4877 child_event->overflow_handler = parent_event->overflow_handler; 5304 child_event->overflow_handler = parent_event->overflow_handler;
4878 5305
@@ -5037,10 +5464,14 @@ void perf_event_exit_task(struct task_struct *child)
5037 * 5464 *
5038 * But since its the parent context it won't be the same instance. 5465 * But since its the parent context it won't be the same instance.
5039 */ 5466 */
5040 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5467 mutex_lock(&child_ctx->mutex);
5041 5468
5042again: 5469again:
5043 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5470 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5471 group_entry)
5472 __perf_event_exit_task(child_event, child_ctx, child);
5473
5474 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5044 group_entry) 5475 group_entry)
5045 __perf_event_exit_task(child_event, child_ctx, child); 5476 __perf_event_exit_task(child_event, child_ctx, child);
5046 5477
@@ -5049,7 +5480,8 @@ again:
5049 * its siblings to the list, but we obtained 'tmp' before that which 5480 * its siblings to the list, but we obtained 'tmp' before that which
5050 * will still point to the list head terminating the iteration. 5481 * will still point to the list head terminating the iteration.
5051 */ 5482 */
5052 if (!list_empty(&child_ctx->group_list)) 5483 if (!list_empty(&child_ctx->pinned_groups) ||
5484 !list_empty(&child_ctx->flexible_groups))
5053 goto again; 5485 goto again;
5054 5486
5055 mutex_unlock(&child_ctx->mutex); 5487 mutex_unlock(&child_ctx->mutex);
@@ -5057,6 +5489,25 @@ again:
5057 put_ctx(child_ctx); 5489 put_ctx(child_ctx);
5058} 5490}
5059 5491
5492static void perf_free_event(struct perf_event *event,
5493 struct perf_event_context *ctx)
5494{
5495 struct perf_event *parent = event->parent;
5496
5497 if (WARN_ON_ONCE(!parent))
5498 return;
5499
5500 mutex_lock(&parent->child_mutex);
5501 list_del_init(&event->child_list);
5502 mutex_unlock(&parent->child_mutex);
5503
5504 fput(parent->filp);
5505
5506 perf_group_detach(event);
5507 list_del_event(event, ctx);
5508 free_event(event);
5509}
5510
5060/* 5511/*
5061 * free an unexposed, unused context as created by inheritance by 5512 * free an unexposed, unused context as created by inheritance by
5062 * init_task below, used by fork() in case of fail. 5513 * init_task below, used by fork() in case of fail.
@@ -5071,36 +5522,70 @@ void perf_event_free_task(struct task_struct *task)
5071 5522
5072 mutex_lock(&ctx->mutex); 5523 mutex_lock(&ctx->mutex);
5073again: 5524again:
5074 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5525 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5075 struct perf_event *parent = event->parent; 5526 perf_free_event(event, ctx);
5076 5527
5077 if (WARN_ON_ONCE(!parent)) 5528 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5078 continue; 5529 group_entry)
5530 perf_free_event(event, ctx);
5079 5531
5080 mutex_lock(&parent->child_mutex); 5532 if (!list_empty(&ctx->pinned_groups) ||
5081 list_del_init(&event->child_list); 5533 !list_empty(&ctx->flexible_groups))
5082 mutex_unlock(&parent->child_mutex); 5534 goto again;
5083 5535
5084 fput(parent->filp); 5536 mutex_unlock(&ctx->mutex);
5085 5537
5086 list_del_event(event, ctx); 5538 put_ctx(ctx);
5087 free_event(event); 5539}
5540
5541static int
5542inherit_task_group(struct perf_event *event, struct task_struct *parent,
5543 struct perf_event_context *parent_ctx,
5544 struct task_struct *child,
5545 int *inherited_all)
5546{
5547 int ret;
5548 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5549
5550 if (!event->attr.inherit) {
5551 *inherited_all = 0;
5552 return 0;
5088 } 5553 }
5089 5554
5090 if (!list_empty(&ctx->group_list)) 5555 if (!child_ctx) {
5091 goto again; 5556 /*
5557 * This is executed from the parent task context, so
5558 * inherit events that have been marked for cloning.
5559 * First allocate and initialize a context for the
5560 * child.
5561 */
5092 5562
5093 mutex_unlock(&ctx->mutex); 5563 child_ctx = kzalloc(sizeof(struct perf_event_context),
5564 GFP_KERNEL);
5565 if (!child_ctx)
5566 return -ENOMEM;
5094 5567
5095 put_ctx(ctx); 5568 __perf_event_init_context(child_ctx, child);
5569 child->perf_event_ctxp = child_ctx;
5570 get_task_struct(child);
5571 }
5572
5573 ret = inherit_group(event, parent, parent_ctx,
5574 child, child_ctx);
5575
5576 if (ret)
5577 *inherited_all = 0;
5578
5579 return ret;
5096} 5580}
5097 5581
5582
5098/* 5583/*
5099 * Initialize the perf_event context in task_struct 5584 * Initialize the perf_event context in task_struct
5100 */ 5585 */
5101int perf_event_init_task(struct task_struct *child) 5586int perf_event_init_task(struct task_struct *child)
5102{ 5587{
5103 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5588 struct perf_event_context *child_ctx, *parent_ctx;
5104 struct perf_event_context *cloned_ctx; 5589 struct perf_event_context *cloned_ctx;
5105 struct perf_event *event; 5590 struct perf_event *event;
5106 struct task_struct *parent = current; 5591 struct task_struct *parent = current;
@@ -5138,41 +5623,22 @@ int perf_event_init_task(struct task_struct *child)
5138 * We dont have to disable NMIs - we are only looking at 5623 * We dont have to disable NMIs - we are only looking at
5139 * the list, not manipulating it: 5624 * the list, not manipulating it:
5140 */ 5625 */
5141 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5626 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5142 5627 ret = inherit_task_group(event, parent, parent_ctx, child,
5143 if (!event->attr.inherit) { 5628 &inherited_all);
5144 inherited_all = 0; 5629 if (ret)
5145 continue; 5630 break;
5146 } 5631 }
5147
5148 if (!child->perf_event_ctxp) {
5149 /*
5150 * This is executed from the parent task context, so
5151 * inherit events that have been marked for cloning.
5152 * First allocate and initialize a context for the
5153 * child.
5154 */
5155
5156 child_ctx = kzalloc(sizeof(struct perf_event_context),
5157 GFP_KERNEL);
5158 if (!child_ctx) {
5159 ret = -ENOMEM;
5160 break;
5161 }
5162
5163 __perf_event_init_context(child_ctx, child);
5164 child->perf_event_ctxp = child_ctx;
5165 get_task_struct(child);
5166 }
5167 5632
5168 ret = inherit_group(event, parent, parent_ctx, 5633 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5169 child, child_ctx); 5634 ret = inherit_task_group(event, parent, parent_ctx, child,
5170 if (ret) { 5635 &inherited_all);
5171 inherited_all = 0; 5636 if (ret)
5172 break; 5637 break;
5173 }
5174 } 5638 }
5175 5639
5640 child_ctx = child->perf_event_ctxp;
5641
5176 if (child_ctx && inherited_all) { 5642 if (child_ctx && inherited_all) {
5177 /* 5643 /*
5178 * Mark the child context as a clone of the parent 5644 * Mark the child context as a clone of the parent
@@ -5200,18 +5666,37 @@ int perf_event_init_task(struct task_struct *child)
5200 return ret; 5666 return ret;
5201} 5667}
5202 5668
5669static void __init perf_event_init_all_cpus(void)
5670{
5671 int cpu;
5672 struct perf_cpu_context *cpuctx;
5673
5674 for_each_possible_cpu(cpu) {
5675 cpuctx = &per_cpu(perf_cpu_context, cpu);
5676 mutex_init(&cpuctx->hlist_mutex);
5677 __perf_event_init_context(&cpuctx->ctx, NULL);
5678 }
5679}
5680
5203static void __cpuinit perf_event_init_cpu(int cpu) 5681static void __cpuinit perf_event_init_cpu(int cpu)
5204{ 5682{
5205 struct perf_cpu_context *cpuctx; 5683 struct perf_cpu_context *cpuctx;
5206 5684
5207 cpuctx = &per_cpu(perf_cpu_context, cpu); 5685 cpuctx = &per_cpu(perf_cpu_context, cpu);
5208 __perf_event_init_context(&cpuctx->ctx, NULL);
5209 5686
5210 spin_lock(&perf_resource_lock); 5687 spin_lock(&perf_resource_lock);
5211 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5688 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5212 spin_unlock(&perf_resource_lock); 5689 spin_unlock(&perf_resource_lock);
5213 5690
5214 hw_perf_event_setup(cpu); 5691 mutex_lock(&cpuctx->hlist_mutex);
5692 if (cpuctx->hlist_refcount > 0) {
5693 struct swevent_hlist *hlist;
5694
5695 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5696 WARN_ON_ONCE(!hlist);
5697 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5698 }
5699 mutex_unlock(&cpuctx->hlist_mutex);
5215} 5700}
5216 5701
5217#ifdef CONFIG_HOTPLUG_CPU 5702#ifdef CONFIG_HOTPLUG_CPU
@@ -5221,7 +5706,9 @@ static void __perf_event_exit_cpu(void *info)
5221 struct perf_event_context *ctx = &cpuctx->ctx; 5706 struct perf_event_context *ctx = &cpuctx->ctx;
5222 struct perf_event *event, *tmp; 5707 struct perf_event *event, *tmp;
5223 5708
5224 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5709 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5710 __perf_event_remove_from_context(event);
5711 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5225 __perf_event_remove_from_context(event); 5712 __perf_event_remove_from_context(event);
5226} 5713}
5227static void perf_event_exit_cpu(int cpu) 5714static void perf_event_exit_cpu(int cpu)
@@ -5229,6 +5716,10 @@ static void perf_event_exit_cpu(int cpu)
5229 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5716 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5230 struct perf_event_context *ctx = &cpuctx->ctx; 5717 struct perf_event_context *ctx = &cpuctx->ctx;
5231 5718
5719 mutex_lock(&cpuctx->hlist_mutex);
5720 swevent_hlist_release(cpuctx);
5721 mutex_unlock(&cpuctx->hlist_mutex);
5722
5232 mutex_lock(&ctx->mutex); 5723 mutex_lock(&ctx->mutex);
5233 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5724 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5234 mutex_unlock(&ctx->mutex); 5725 mutex_unlock(&ctx->mutex);
@@ -5249,11 +5740,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5249 perf_event_init_cpu(cpu); 5740 perf_event_init_cpu(cpu);
5250 break; 5741 break;
5251 5742
5252 case CPU_ONLINE:
5253 case CPU_ONLINE_FROZEN:
5254 hw_perf_event_setup_online(cpu);
5255 break;
5256
5257 case CPU_DOWN_PREPARE: 5743 case CPU_DOWN_PREPARE:
5258 case CPU_DOWN_PREPARE_FROZEN: 5744 case CPU_DOWN_PREPARE_FROZEN:
5259 perf_event_exit_cpu(cpu); 5745 perf_event_exit_cpu(cpu);
@@ -5276,6 +5762,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5276 5762
5277void __init perf_event_init(void) 5763void __init perf_event_init(void)
5278{ 5764{
5765 perf_event_init_all_cpus();
5279 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5766 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5280 (void *)(long)smp_processor_id()); 5767 (void *)(long)smp_processor_id());
5281 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5768 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5283,13 +5770,16 @@ void __init perf_event_init(void)
5283 register_cpu_notifier(&perf_cpu_nb); 5770 register_cpu_notifier(&perf_cpu_nb);
5284} 5771}
5285 5772
5286static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5773static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5774 struct sysdev_class_attribute *attr,
5775 char *buf)
5287{ 5776{
5288 return sprintf(buf, "%d\n", perf_reserved_percpu); 5777 return sprintf(buf, "%d\n", perf_reserved_percpu);
5289} 5778}
5290 5779
5291static ssize_t 5780static ssize_t
5292perf_set_reserve_percpu(struct sysdev_class *class, 5781perf_set_reserve_percpu(struct sysdev_class *class,
5782 struct sysdev_class_attribute *attr,
5293 const char *buf, 5783 const char *buf,
5294 size_t count) 5784 size_t count)
5295{ 5785{
@@ -5318,13 +5808,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5318 return count; 5808 return count;
5319} 5809}
5320 5810
5321static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5811static ssize_t perf_show_overcommit(struct sysdev_class *class,
5812 struct sysdev_class_attribute *attr,
5813 char *buf)
5322{ 5814{
5323 return sprintf(buf, "%d\n", perf_overcommit); 5815 return sprintf(buf, "%d\n", perf_overcommit);
5324} 5816}
5325 5817
5326static ssize_t 5818static ssize_t
5327perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5819perf_set_overcommit(struct sysdev_class *class,
5820 struct sysdev_class_attribute *attr,
5821 const char *buf, size_t count)
5328{ 5822{
5329 unsigned long val; 5823 unsigned long val;
5330 int err; 5824 int err;
diff --git a/kernel/pid.c b/kernel/pid.c
index 2e17c9c92cbe..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first,
371 rcu_read_lock_held() ||
372 lockdep_tasklist_lock_is_held());
371 if (first) 373 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 374 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 375 }
@@ -376,7 +378,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
376EXPORT_SYMBOL(pid_task); 378EXPORT_SYMBOL(pid_task);
377 379
378/* 380/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 381 * Must be called under rcu_read_lock().
380 */ 382 */
381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 383struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382{ 384{
@@ -511,6 +513,13 @@ void __init pidhash_init(void)
511 513
512void __init pidmap_init(void) 514void __init pidmap_init(void)
513{ 515{
516 /* bump default and minimum pid_max based on number of cpus */
517 pid_max = min(pid_max_max, max_t(int, pid_max,
518 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
519 pid_max_min = max_t(int, pid_max_min,
520 PIDS_PER_CPU_MIN * num_possible_cpus());
521 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
522
514 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 523 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
515 /* Reserve PID 0. We never call free_pidmap(0) */ 524 /* Reserve PID 0. We never call free_pidmap(0) */
516 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
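
The pidmap_init() hunk above scales the default and minimum pid_max with the number of possible CPUs. A back-of-envelope check is sketched below; the PIDS_PER_CPU_* and limit constants are assumptions matching the usual include/linux/threads.h values, and 301 is the customary pid_max_min default, so treat the output as illustrative only — the authoritative numbers come from the pr_info() line the patch adds.

#include <stdio.h>

#define PID_MAX_DEFAULT		0x8000		/* 32768 */
#define PID_MAX_LIMIT		(4 * 1024 * 1024)	/* 64-bit case, assumed */
#define PIDS_PER_CPU_DEFAULT	1024		/* assumed */
#define PIDS_PER_CPU_MIN	8		/* assumed */

static int max_int(int a, int b) { return a > b ? a : b; }
static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int cpus;

	for (cpus = 1; cpus <= 4096; cpus *= 8) {
		int pid_max = min_int(PID_MAX_LIMIT,
				      max_int(PID_MAX_DEFAULT,
					      PIDS_PER_CPU_DEFAULT * cpus));
		int pid_max_min = max_int(301, PIDS_PER_CPU_MIN * cpus);

		printf("%4d cpus -> pid_max %8d, minimum %5d\n",
		       cpus, pid_max, pid_max_min);
	}
	return 0;
}
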
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
161 rcu_read_lock(); 162 rcu_read_lock();
162 163
163 /* 164 /*
164 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring 165 * Any nested-container's init processes won't ignore the
165 * any nested-container's init processes don't ignore the 166 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
166 * signal
167 */ 167 */
168 task = pid_task(find_vpid(nr), PIDTYPE_PID); 168 task = pid_task(find_vpid(nr), PIDTYPE_PID);
169 if (task) 169 if (task)
170 force_sig(SIGKILL, task); 170 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
171 171
172 rcu_read_unlock(); 172 rcu_read_unlock();
173 173
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 205 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /*guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
374 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
395 389
396 390
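
From userspace the misc-device interface keeps its classic shape: open the node, write the constraint, and hold the descriptor open for as long as the constraint should apply (pm_qos_power_open() now allocates the request, pm_qos_power_release() removes it). A minimal sketch using the unchanged binary s32 form of the write; the new 11-byte "0x…" ASCII form accepted above is an alternative.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int32_t latency_us = 50;	/* request a 50 usec wakeup-latency bound */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/cpu_dma_latency");
		return 1;
	}
	if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
		perror("write");
		return 1;
	}

	/* ... latency-sensitive work here; keep fd open the whole time ... */
	pause();	/* placeholder for the real workload */

	close(fd);	/* request is removed on release */
	return 0;
}
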
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 438ff4523513..9829646d399c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
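
update_rlimit_cpu() now takes siglock and refreshes the expiration cache unconditionally; the usual way to reach it from userspace is simply setrlimit(RLIMIT_CPU) on a running process. A minimal self-contained trigger:

#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

int main(void)
{
	/* Cap this process's CPU time: SIGXCPU at the 2 s soft limit,
	 * SIGKILL at the 4 s hard limit. */
	struct rlimit rl = { .rlim_cur = 2, .rlim_max = 4 };

	if (setrlimit(RLIMIT_CPU, &rl) < 0) {
		perror("setrlimit(RLIMIT_CPU)");
		return 1;
	}

	for (;;)
		;	/* burn CPU until the limit fires */

	return 0;	/* not reached */
}
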
@@ -364,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
364 } 363 }
365 } else { 364 } else {
366 read_lock(&tasklist_lock); 365 read_lock(&tasklist_lock);
367 if (thread_group_leader(p) && p->signal) { 366 if (thread_group_leader(p) && p->sighand) {
368 error = 367 error =
369 cpu_clock_sample_group(which_clock, 368 cpu_clock_sample_group(which_clock,
370 p, &rtn); 369 p, &rtn);
@@ -440,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
440 439
441 if (likely(p != NULL)) { 440 if (likely(p != NULL)) {
442 read_lock(&tasklist_lock); 441 read_lock(&tasklist_lock);
443 if (unlikely(p->signal == NULL)) { 442 if (unlikely(p->sighand == NULL)) {
444 /* 443 /*
445 * We raced with the reaping of the task. 444 * We raced with the reaping of the task.
446 * The deletion should have cleared us off the list. 445 * The deletion should have cleared us off the list.
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
657 607
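The rewritten arm_timer() above does two things: it inserts the timer into a per-clock list kept sorted by expiry, and, when the new entry lands at the head, it refreshes the shared expiration cache (p->cputime_expires or p->signal->cputime_expires). A minimal sketch of that pattern, with simplified types and invented helper names rather than the kernel's:

    struct sample_timer {
            unsigned long long expires;
            struct sample_timer *next;
    };

    struct timer_base {
            struct sample_timer *head;          /* singly linked, sorted by expires */
            unsigned long long earliest_cache;  /* 0 == nothing armed */
    };

    /* Insert in expiry order; refresh the cached earliest expiry only when
     * the new timer becomes the list head (the "listpos == head" case). */
    static void sample_arm_timer(struct timer_base *base, struct sample_timer *nt)
    {
            struct sample_timer **pos = &base->head;

            while (*pos && (*pos)->expires <= nt->expires)
                    pos = &(*pos)->next;
            nt->next = *pos;
            *pos = nt;

            if (pos == &base->head &&
                (base->earliest_cache == 0 || base->earliest_cache > nt->expires))
                    base->earliest_cache = nt->expires;
    }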
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User doesn't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
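The new SIGEV_NONE branch in cpu_timer_fire() simply disarms timers whose owner never wants a signal; such timers are only ever polled. A user-space illustration (Linux, link with -lrt; error handling trimmed):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct sigevent sev = { .sigev_notify = SIGEV_NONE };
            struct itimerspec its = { .it_value.tv_sec = 1 };
            struct itimerspec left;
            timer_t t;

            /* CPU-time timer that never delivers a signal; we just poll it. */
            timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &t);
            timer_settime(t, 0, &its, NULL);

            do {
                    timer_gettime(t, &left);        /* spinning burns CPU time */
            } while (left.it_value.tv_sec || left.it_value.tv_nsec);

            printf("about one second of CPU time consumed\n");
            timer_delete(t);
            return 0;
    }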
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -736,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
736 read_lock(&tasklist_lock); 691 read_lock(&tasklist_lock);
737 /* 692 /*
738 * We need the tasklist_lock to protect against reaping that 693 * We need the tasklist_lock to protect against reaping that
739 * clears p->signal. If p has just been reaped, we can no 694 * clears p->sighand. If p has just been reaped, we can no
740 * longer get any information about it at all. 695 * longer get any information about it at all.
741 */ 696 */
742 if (unlikely(p->signal == NULL)) { 697 if (unlikely(p->sighand == NULL)) {
743 read_unlock(&tasklist_lock); 698 read_unlock(&tasklist_lock);
744 put_task_struct(p); 699 put_task_struct(p);
745 timer->it.cpu.task = NULL; 700 timer->it.cpu.task = NULL;
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
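Sampling old_incr before the timer is touched is what keeps the interval reported back through the old argument of timer_settime() consistent even when the timer is concurrently firing. The user-visible contract, for reference (Linux, -lrt):

    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct sigevent sev = { .sigev_notify = SIGEV_NONE };
            struct itimerspec first = { .it_value.tv_sec = 5 };
            struct itimerspec next  = { .it_value.tv_sec = 2, .it_interval.tv_sec = 1 };
            struct itimerspec prev;
            timer_t t;

            timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &t);
            timer_settime(t, 0, &first, NULL);

            /* Re-arm: 'prev' receives the previous interval and the time left. */
            timer_settime(t, 0, &next, &prev);
            printf("previous interval %llds, time left %llds\n",
                   (long long)prev.it_interval.tv_sec, (long long)prev.it_value.tv_sec);

            timer_delete(t);
            return 0;
    }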
@@ -908,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
908 clear_dead = p->exit_state; 863 clear_dead = p->exit_state;
909 } else { 864 } else {
910 read_lock(&tasklist_lock); 865 read_lock(&tasklist_lock);
911 if (unlikely(p->signal == NULL)) { 866 if (unlikely(p->sighand == NULL)) {
912 /* 867 /*
913 * The process has been reaped. 868 * The process has been reaped.
914 * We can't even collect a sample any more. 869 * We can't even collect a sample any more.
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -982,6 +918,7 @@ static void check_thread_timers(struct task_struct *tsk,
982 int maxfire; 918 int maxfire;
983 struct list_head *timers = tsk->cpu_timers; 919 struct list_head *timers = tsk->cpu_timers;
984 struct signal_struct *const sig = tsk->signal; 920 struct signal_struct *const sig = tsk->signal;
921 unsigned long soft;
985 922
986 maxfire = 20; 923 maxfire = 20;
987 tsk->cputime_expires.prof_exp = cputime_zero; 924 tsk->cputime_expires.prof_exp = cputime_zero;
@@ -1030,9 +967,10 @@ static void check_thread_timers(struct task_struct *tsk,
1030 /* 967 /*
1031 * Check for the special case thread timers. 968 * Check for the special case thread timers.
1032 */ 969 */
1033 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { 970 soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
1034 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; 971 if (soft != RLIM_INFINITY) {
1035 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; 972 unsigned long hard =
973 ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
1036 974
1037 if (hard != RLIM_INFINITY && 975 if (hard != RLIM_INFINITY &&
1038 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { 976 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -1043,14 +981,13 @@ static void check_thread_timers(struct task_struct *tsk,
1043 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 981 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1044 return; 982 return;
1045 } 983 }
1046 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { 984 if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
1047 /* 985 /*
1048 * At the soft limit, send a SIGXCPU every second. 986 * At the soft limit, send a SIGXCPU every second.
1049 */ 987 */
1050 if (sig->rlim[RLIMIT_RTTIME].rlim_cur 988 if (soft < hard) {
1051 < sig->rlim[RLIMIT_RTTIME].rlim_max) { 989 soft += USEC_PER_SEC;
1052 sig->rlim[RLIMIT_RTTIME].rlim_cur += 990 sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
1053 USEC_PER_SEC;
1054 } 991 }
1055 printk(KERN_INFO 992 printk(KERN_INFO
1056 "RT Watchdog Timeout: %s[%d]\n", 993 "RT Watchdog Timeout: %s[%d]\n",
@@ -1060,14 +997,11 @@ static void check_thread_timers(struct task_struct *tsk,
1060 } 997 }
1061} 998}
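The soft and hard RLIMIT_RTTIME values are now read once with ACCESS_ONCE() into locals before being compared, because another thread can change the limits concurrently. The limit itself, microseconds of CPU time a SCHED_FIFO/SCHED_RR task may run without sleeping, is configured from user space; a sketch, assuming a libc that exposes RLIMIT_RTTIME:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
            /* SIGXCPU at 0.5s of uninterrupted RT CPU time, SIGKILL at 1s. */
            struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 1000000 };

            if (setrlimit(RLIMIT_RTTIME, &rl) != 0)
                    perror("setrlimit(RLIMIT_RTTIME)");
            return 0;
    }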
1062 999
1063static void stop_process_timers(struct task_struct *tsk) 1000static void stop_process_timers(struct signal_struct *sig)
1064{ 1001{
1065 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1066 unsigned long flags; 1003 unsigned long flags;
1067 1004
1068 if (!cputimer->running)
1069 return;
1070
1071 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1072 cputimer->running = 0; 1006 cputimer->running = 0;
1073 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
@@ -1107,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1107 } 1041 }
1108} 1042}
1109 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1110/* 1061/*
1111 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1112 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1121,19 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
1121 unsigned long long sum_sched_runtime, sched_expires; 1072 unsigned long long sum_sched_runtime, sched_expires;
1122 struct list_head *timers = sig->cpu_timers; 1073 struct list_head *timers = sig->cpu_timers;
1123 struct task_cputime cputime; 1074 struct task_cputime cputime;
1124 1075 unsigned long soft;
1125 /*
1126 * Don't sample the current process CPU clocks if there are no timers.
1127 */
1128 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1129 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1130 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1131 list_empty(&timers[CPUCLOCK_VIRT]) &&
1132 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1133 list_empty(&timers[CPUCLOCK_SCHED])) {
1134 stop_process_timers(tsk);
1135 return;
1136 }
1137 1076
1138 /* 1077 /*
1139 * Collect the current process totals. 1078 * Collect the current process totals.
@@ -1193,11 +1132,13 @@ static void check_process_timers(struct task_struct *tsk,
1193 SIGPROF); 1132 SIGPROF);
1194 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, 1133 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1195 SIGVTALRM); 1134 SIGVTALRM);
1196 1135 soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1197 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1136 if (soft != RLIM_INFINITY) {
1198 unsigned long psecs = cputime_to_secs(ptime); 1137 unsigned long psecs = cputime_to_secs(ptime);
1138 unsigned long hard =
1139 ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
1199 cputime_t x; 1140 cputime_t x;
1200 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { 1141 if (psecs >= hard) {
1201 /* 1142 /*
1202 * At the hard limit, we just die. 1143 * At the hard limit, we just die.
1203 * No need to calculate anything else now. 1144 * No need to calculate anything else now.
@@ -1205,35 +1146,28 @@ static void check_process_timers(struct task_struct *tsk,
1205 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); 1146 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1206 return; 1147 return;
1207 } 1148 }
1208 if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { 1149 if (psecs >= soft) {
1209 /* 1150 /*
1210 * At the soft limit, send a SIGXCPU every second. 1151 * At the soft limit, send a SIGXCPU every second.
1211 */ 1152 */
1212 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1153 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1213 if (sig->rlim[RLIMIT_CPU].rlim_cur 1154 if (soft < hard) {
1214 < sig->rlim[RLIMIT_CPU].rlim_max) { 1155 soft++;
1215 sig->rlim[RLIMIT_CPU].rlim_cur++; 1156 sig->rlim[RLIMIT_CPU].rlim_cur = soft;
1216 } 1157 }
1217 } 1158 }
1218 x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 1159 x = secs_to_cputime(soft);
1219 if (cputime_eq(prof_expires, cputime_zero) || 1160 if (cputime_eq(prof_expires, cputime_zero) ||
1220 cputime_lt(x, prof_expires)) { 1161 cputime_lt(x, prof_expires)) {
1221 prof_expires = x; 1162 prof_expires = x;
1222 } 1163 }
1223 } 1164 }
1224 1165
1225 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1226 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1227 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1228 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1229 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1230 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1231 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1232 sig->cputime_expires.virt_exp = virt_expires;
1233 if (sched_expires != 0 &&
1234 (sig->cputime_expires.sched_exp == 0 ||
1235 sig->cputime_expires.sched_exp > sched_expires))
1236 sig->cputime_expires.sched_exp = sched_expires;
1237} 1171}
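check_process_timers() folds RLIMIT_CPU into the same expiration machinery: once the soft limit is crossed the process gets SIGXCPU every second (and the soft limit is nudged forward), and at the hard limit it is killed. Observed from user space:

    #include <signal.h>
    #include <sys/resource.h>
    #include <unistd.h>

    static void on_xcpu(int sig)
    {
            (void)sig;
            write(2, "SIGXCPU: soft CPU limit hit\n", 28);
    }

    int main(void)
    {
            struct rlimit rl = { .rlim_cur = 1, .rlim_max = 3 };    /* seconds */

            signal(SIGXCPU, on_xcpu);
            setrlimit(RLIMIT_CPU, &rl);

            for (;;)
                    ;       /* burn CPU: SIGXCPU after ~1s, SIGKILL at ~3s */
    }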
1238 1172
1239/* 1173/*
@@ -1262,9 +1196,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1262 goto out; 1196 goto out;
1263 } 1197 }
1264 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1265 } else { 1200 } else {
1266 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1267 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->sighand == NULL)) {
1268 /* 1203 /*
1269 * The process has been reaped. 1204 * The process has been reaped.
1270 * We can't even collect a sample any more. 1205 * We can't even collect a sample any more.
@@ -1282,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1282 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1283 goto out_unlock; 1218 goto out_unlock;
1284 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1285 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1286 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1287 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1290,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 /* 1226 /*
1291 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1292 */ 1228 */
1293 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1294 1232
1295out_unlock: 1233out_unlock:
1296 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1302,23 +1240,6 @@ out:
1302} 1240}
1303 1241
1304/** 1242/**
1305 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1306 *
1307 * @cputime: The struct to compare.
1308 *
1309 * Checks @cputime to see if all fields are zero. Returns true if all fields
1310 * are zero, false if any field is nonzero.
1311 */
1312static inline int task_cputime_zero(const struct task_cputime *cputime)
1313{
1314 if (cputime_eq(cputime->utime, cputime_zero) &&
1315 cputime_eq(cputime->stime, cputime_zero) &&
1316 cputime->sum_exec_runtime == 0)
1317 return 1;
1318 return 0;
1319}
1320
1321/**
1322 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1323 * 1244 *
1324 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1374,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1374 } 1295 }
1375 1296
1376 sig = tsk->signal; 1297 sig = tsk->signal;
1377 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1378 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1379 1300
1380 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1382,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 return 1; 1303 return 1;
1383 } 1304 }
1384 1305
1385 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1386} 1307}
1387 1308
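fastpath_timer_check() now gates the expensive group sample on sig->cputimer.running instead of re-deriving whether any process-wide timer is armed. The general shape of that optimisation, sketched with pthreads and illustrative names (not the kernel's types):

    #include <pthread.h>
    #include <stdbool.h>

    struct group_timer {
            pthread_mutex_t lock;
            bool running;                  /* checked without the lock, like cputimer.running */
            unsigned long long expires;    /* earliest group expiry, 0 = none */
    };

    /* Called on every tick: take the lock only when something is armed. */
    static bool tick_check(struct group_timer *gt, unsigned long long now)
    {
            bool fired = false;

            if (!gt->running)              /* fast path: nothing to do */
                    return false;

            pthread_mutex_lock(&gt->lock);
            if (gt->expires && now >= gt->expires) {
                    fired = true;
                    gt->expires = 0;
                    gt->running = false;   /* last timer gone: stop sampling */
            }
            pthread_mutex_unlock(&gt->lock);
            return fired;
    }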
1388/* 1309/*
@@ -1411,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1411 * put them on the firing list. 1332 * put them on the firing list.
1412 */ 1333 */
1413 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1414 check_process_timers(tsk, &firing); 1335 /*
1336 * If there are any active process wide timers (POSIX 1.b, itimers,
1337 * RLIMIT_CPU) cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1415 1341
1416 /* 1342 /*
1417 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1448,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1448} 1374}
1449 1375
1450/* 1376/*
1451 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1452 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1453 * The *newval argument is relative and we update it to be absolute, *oldval
1454 * is absolute and we update it to be relative.
1455 */ 1379 */
1456void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1458{ 1382{
1459 union cpu_time_count now; 1383 union cpu_time_count now;
1460 struct list_head *head;
1461 1384
1462 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1463 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1464 1387
1465 if (oldval) { 1388 if (oldval) {
1389 /*
1390 * We are setting itimer. The *oldval is absolute and we update
1391 * it to be relative, *newval argument is relative and we update
1392 * it to be absolute.
1393 */
1466 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1467 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1468 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1475,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1475 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1476 return; 1404 return;
1477 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1478
1479 /*
1480 * If the RLIMIT_CPU timer will expire before the
1481 * ITIMER_PROF timer, we have nothing else to do.
1482 */
1483 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1484 < cputime_to_secs(*newval))
1485 return;
1486 } 1406 }
1487 1407
1488 /* 1408 /*
1489 * Check whether there are any process timers already set to fire 1409 * Update expiration cache if we are the earliest timer, or if the
1490 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit expires earlier than the prof_exp cpu timer.
1491 */ 1411 */
1492 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1493 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1494 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1495 struct cpu_timer_list, entry)->expires.cpu,
1496 *newval)) {
1497 switch (clock_idx) {
1498 case CPUCLOCK_PROF:
1499 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1500 break; 1416 break;
1501 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1502 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1503 break; 1420 break;
1504 }
1505 } 1421 }
1506} 1422}
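set_process_cpu_timer() is the backend of setitimer(ITIMER_PROF/ITIMER_VIRTUAL): the relative value passed in is made absolute against the sampled group clock and merged into the shared expiration cache. The corresponding user-space call:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/time.h>

    static volatile sig_atomic_t ticks;

    static void on_prof(int sig)
    {
            (void)sig;
            ticks++;
    }

    int main(void)
    {
            /* SIGPROF every 100ms of combined user+system CPU time. */
            struct itimerval iv = {
                    .it_value    = { 0, 100000 },
                    .it_interval = { 0, 100000 },
            };

            signal(SIGPROF, on_prof);
            setitimer(ITIMER_PROF, &iv, NULL);

            while (ticks < 10)
                    ;       /* spin so CPU time actually accumulates */
            printf("received %d SIGPROF ticks\n", (int)ticks);
            return 0;
    }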
1507 1423
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..ad723420acc3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
@@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
559 new_timer->it_id = (timer_t) new_timer_id; 559 new_timer->it_id = (timer_t) new_timer_id;
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
563 if (error)
564 goto out;
565 562
566 /*
567 * return the timer_id now. The next step is hard to
568 * back out if there is an error.
569 */
570 if (copy_to_user(created_timer_id, 563 if (copy_to_user(created_timer_id,
571 &new_timer_id, sizeof (new_timer_id))) { 564 &new_timer_id, sizeof (new_timer_id))) {
572 error = -EFAULT; 565 error = -EFAULT;
@@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 new_timer->sigq->info.si_tid = new_timer->it_id; 590 new_timer->sigq->info.si_tid = new_timer->it_id;
598 new_timer->sigq->info.si_code = SI_TIMER; 591 new_timer->sigq->info.si_code = SI_TIMER;
599 592
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error)
595 goto out;
596
600 spin_lock_irq(&current->sighand->siglock); 597 spin_lock_irq(&current->sighand->siglock);
601 new_timer->it_signal = current->signal; 598 new_timer->it_signal = current->signal;
602 list_add(&new_timer->list, &current->signal->posix_timers); 599 list_add(&new_timer->list, &current->signal->posix_timers);
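Moving the CLOCK_DISPATCH(timer_create) call after the sigqueue setup means the clock-specific hook only ever sees a fully initialised timer whose queued siginfo already carries si_tid and si_code = SI_TIMER. Those are the fields a SA_SIGINFO handler receives; a user-space illustration (Linux, -lrt):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>
    #include <unistd.h>

    static volatile sig_atomic_t deliveries;

    static void handler(int sig, siginfo_t *si, void *ctx)
    {
            (void)sig; (void)ctx;
            if (si->si_code == SI_TIMER)    /* queued by the posix timer code */
                    deliveries++;
    }

    int main(void)
    {
            struct sigaction sa;
            struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGRTMIN };
            struct itimerspec its = { .it_value.tv_nsec    = 100000000,
                                      .it_interval.tv_nsec = 100000000 };
            timer_t t;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = handler;
            sa.sa_flags = SA_SIGINFO;
            sigemptyset(&sa.sa_mask);
            sigaction(SIGRTMIN, &sa, NULL);

            timer_create(CLOCK_MONOTONIC, &sev, &t);
            timer_settime(t, 0, &its, NULL);

            while (deliveries < 10)
                    pause();                /* woken by each SIGRTMIN delivery */
            printf("overrun count at last delivery: %d\n", timer_getoverrun(t));
            timer_delete(t);
            return 0;
    }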
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,9 +94,18 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
102config SUSPEND_NVS
103 bool
104
88config SUSPEND 105config SUSPEND
89 bool "Suspend to RAM and standby" 106 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 107 depends on PM && ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
91 default y 109 default y
92 ---help--- 110 ---help---
93 Allow the system to enter sleep states in which main memory is 111 Allow the system to enter sleep states in which main memory is
@@ -116,13 +134,10 @@ config SUSPEND_FREEZER
116 134
117 Turning OFF this setting is NOT recommended! If in doubt, say Y. 135 Turning OFF this setting is NOT recommended! If in doubt, say Y.
118 136
119config HIBERNATION_NVS
120 bool
121
122config HIBERNATION 137config HIBERNATION
123 bool "Hibernation (aka 'suspend to disk')" 138 bool "Hibernation (aka 'suspend to disk')"
124 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
125 select HIBERNATION_NVS if HAS_IOMEM 140 select SUSPEND_NVS if HAS_IOMEM
126 ---help--- 141 ---help---
127 Enable the suspend to disk (STD) functionality, which is usually 142 Enable the suspend to disk (STD) functionality, which is usually
128 called "hibernation" in user interfaces. STD checkpoints the 143 called "hibernation" in user interfaces. STD checkpoints the
@@ -222,3 +237,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 237 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 238 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 239 wake-up events.
240
241config PM_OPS
242 bool
243 depends on PM_SLEEP || PM_RUNTIME
244 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
20 * @sector: physical sector of the page.
21 * @page: page we're reading or writing.
22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
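hib_bio_read_page()/hib_bio_write_page() either complete synchronously (bio_chain == NULL) or queue the bio on a caller-owned chain that is reaped later with hib_wait_on_bio_chain(). A kernel-side sketch of batching reads that way; it only compiles in-tree against the declarations added to power.h, the helper name is invented, and each buffer is assumed to be a full page:

    /* Read 'nr' consecutive image pages asynchronously, then wait once. */
    static int read_pages(pgoff_t first, void **bufs, unsigned int nr)
    {
            struct bio *bio_chain = NULL;
            unsigned int i;
            int error = 0;

            for (i = 0; i < nr && !error; i++)
                    error = hib_bio_read_page(first + i, bufs[i], &bio_chain);

            /* Always reap what was queued, but keep the first error seen. */
            if (error)
                    hib_wait_on_bio_chain(&bio_chain);
            else
                    error = hib_wait_on_bio_chain(&bio_chain);
            return error;
    }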
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index bbfe472d7524..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -323,6 +324,7 @@ static int create_image(int platform_mode)
323int hibernation_snapshot(int platform_mode) 324int hibernation_snapshot(int platform_mode)
324{ 325{
325 int error; 326 int error;
327 gfp_t saved_mask;
326 328
327 error = platform_begin(platform_mode); 329 error = platform_begin(platform_mode);
328 if (error) 330 if (error)
@@ -334,6 +336,7 @@ int hibernation_snapshot(int platform_mode)
334 goto Close; 336 goto Close;
335 337
336 suspend_console(); 338 suspend_console();
339 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
337 error = dpm_suspend_start(PMSG_FREEZE); 340 error = dpm_suspend_start(PMSG_FREEZE);
338 if (error) 341 if (error)
339 goto Recover_platform; 342 goto Recover_platform;
@@ -351,6 +354,7 @@ int hibernation_snapshot(int platform_mode)
351 354
352 dpm_resume_end(in_suspend ? 355 dpm_resume_end(in_suspend ?
353 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 356 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
357 set_gfp_allowed_mask(saved_mask);
354 resume_console(); 358 resume_console();
355 Close: 359 Close:
356 platform_end(platform_mode); 360 platform_end(platform_mode);
@@ -445,14 +449,17 @@ static int resume_target_kernel(bool platform_mode)
445int hibernation_restore(int platform_mode) 449int hibernation_restore(int platform_mode)
446{ 450{
447 int error; 451 int error;
452 gfp_t saved_mask;
448 453
449 pm_prepare_console(); 454 pm_prepare_console();
450 suspend_console(); 455 suspend_console();
456 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
451 error = dpm_suspend_start(PMSG_QUIESCE); 457 error = dpm_suspend_start(PMSG_QUIESCE);
452 if (!error) { 458 if (!error) {
453 error = resume_target_kernel(platform_mode); 459 error = resume_target_kernel(platform_mode);
454 dpm_resume_end(PMSG_RECOVER); 460 dpm_resume_end(PMSG_RECOVER);
455 } 461 }
462 set_gfp_allowed_mask(saved_mask);
456 resume_console(); 463 resume_console();
457 pm_restore_console(); 464 pm_restore_console();
458 return error; 465 return error;
@@ -466,6 +473,7 @@ int hibernation_restore(int platform_mode)
466int hibernation_platform_enter(void) 473int hibernation_platform_enter(void)
467{ 474{
468 int error; 475 int error;
476 gfp_t saved_mask;
469 477
470 if (!hibernation_ops) 478 if (!hibernation_ops)
471 return -ENOSYS; 479 return -ENOSYS;
@@ -481,6 +489,7 @@ int hibernation_platform_enter(void)
481 489
482 entering_platform_hibernation = true; 490 entering_platform_hibernation = true;
483 suspend_console(); 491 suspend_console();
492 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
484 error = dpm_suspend_start(PMSG_HIBERNATE); 493 error = dpm_suspend_start(PMSG_HIBERNATE);
485 if (error) { 494 if (error) {
486 if (hibernation_ops->recover) 495 if (hibernation_ops->recover)
@@ -518,6 +527,7 @@ int hibernation_platform_enter(void)
518 Resume_devices: 527 Resume_devices:
519 entering_platform_hibernation = false; 528 entering_platform_hibernation = false;
520 dpm_resume_end(PMSG_RESTORE); 529 dpm_resume_end(PMSG_RESTORE);
530 set_gfp_allowed_mask(saved_mask);
521 resume_console(); 531 resume_console();
522 532
523 Close: 533 Close:
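All three entry points above (snapshot, restore, platform enter) now bracket the dpm_suspend_start()/dpm_resume_end() window with clear_gfp_allowed_mask(GFP_IOFS) and set_gfp_allowed_mask(), so allocations made while devices are quiesced cannot recurse into block or filesystem I/O. Condensed into one illustrative helper (the two mask functions and the dpm_* calls are the kernel's; the wrapper itself is not):

    static int example_sleep_transition(void)
    {
            gfp_t saved_mask;
            int error;

            suspend_console();
            saved_mask = clear_gfp_allowed_mask(GFP_IOFS);  /* drop __GFP_IO|__GFP_FS */

            error = dpm_suspend_start(PMSG_SUSPEND);
            if (!error) {
                    /* ... enter the sleep state, then wake back up ... */
                    dpm_resume_end(PMSG_RESUME);
            }

            set_gfp_allowed_mask(saved_mask);       /* I/O-capable allocations again */
            resume_console();
            return error;
    }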
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
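The pm_async_show/store pair above creates /sys/power/pm_async (1 = devices may suspend and resume asynchronously, 0 = strictly sequential). Toggling it needs root and is a one-character write, e.g. echo 0 > /sys/power/pm_async from a shell, or from C:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/power/pm_async", O_WRONLY);

            if (fd < 0 || write(fd, "0", 1) != 1)   /* 0 = synchronous PM */
                    perror("/sys/power/pm_async");
            if (fd >= 0)
                    close(fd);
            return 0;
    }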
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index 39ac698ef836..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -10,11 +10,12 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
16 * Platforms, like ACPI, may want us to save some memory used by them during 17 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent 18 * suspend and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that. 19 * resume. The code below implements a mechanism allowing us to do that.
19 */ 20 */
20 21
@@ -29,7 +30,7 @@ struct nvs_page {
29static LIST_HEAD(nvs_list); 30static LIST_HEAD(nvs_list);
30 31
31/** 32/**
32 * hibernate_nvs_register - register platform NVS memory region to save 33 * suspend_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region 34 * @start - physical address of the region
34 * @size - size of the region 35 * @size - size of the region
35 * 36 *
@@ -37,7 +38,7 @@ static LIST_HEAD(nvs_list);
37 * things so that the data from page-aligned addresses in this region will 38 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages. 39 * be copied into separate RAM pages.
39 */ 40 */
40int hibernate_nvs_register(unsigned long start, unsigned long size) 41int suspend_nvs_register(unsigned long start, unsigned long size)
41{ 42{
42 struct nvs_page *entry, *next; 43 struct nvs_page *entry, *next;
43 44
@@ -67,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
67} 68}
68 69
69/** 70/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions 71 * suspend_nvs_free - free data pages allocated for saving NVS regions
71 */ 72 */
72void hibernate_nvs_free(void) 73void suspend_nvs_free(void)
73{ 74{
74 struct nvs_page *entry; 75 struct nvs_page *entry;
75 76
@@ -85,16 +86,16 @@ void hibernate_nvs_free(void)
85} 86}
86 87
87/** 88/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions 89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
89 */ 90 */
90int hibernate_nvs_alloc(void) 91int suspend_nvs_alloc(void)
91{ 92{
92 struct nvs_page *entry; 93 struct nvs_page *entry;
93 94
94 list_for_each_entry(entry, &nvs_list, node) { 95 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL); 96 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) { 97 if (!entry->data) {
97 hibernate_nvs_free(); 98 suspend_nvs_free();
98 return -ENOMEM; 99 return -ENOMEM;
99 } 100 }
100 } 101 }
@@ -102,9 +103,9 @@ int hibernate_nvs_alloc(void)
102} 103}
103 104
104/** 105/**
105 * hibernate_nvs_save - save NVS memory regions 106 * suspend_nvs_save - save NVS memory regions
106 */ 107 */
107void hibernate_nvs_save(void) 108void suspend_nvs_save(void)
108{ 109{
109 struct nvs_page *entry; 110 struct nvs_page *entry;
110 111
@@ -118,12 +119,12 @@ void hibernate_nvs_save(void)
118} 119}
119 120
120/** 121/**
121 * hibernate_nvs_restore - restore NVS memory regions 122 * suspend_nvs_restore - restore NVS memory regions
122 * 123 *
123 * This function is going to be called with interrupts disabled, so it 124 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region. 125 * cannot iounmap the virtual addresses used to access the NVS region.
125 */ 126 */
126void hibernate_nvs_restore(void) 127void suspend_nvs_restore(void)
127{ 128{
128 struct nvs_page *entry; 129 struct nvs_page *entry;
129 130
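With the rename, platform code calls suspend_nvs_register() once per firmware-reported NVS range at boot, and the suspend core takes care of allocating, saving, restoring and freeing the backup pages around the sleep transition. A hedged registration sketch; the address and size are placeholders, and in-tree the real caller is the ACPI NVS handling:

    static int __init example_register_nvs(void)
    {
            unsigned long start = 0xdf000000;       /* hypothetical NVS region */
            unsigned long size  = 0x00010000;       /* 64 KiB */

            return suspend_nvs_register(start, size);
    }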
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
91 show_state();
92 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
93 do_each_thread(g, p) { 92 do_each_thread(g, p) {
94 task_lock(p); 93 task_lock(p);
95 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
96 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
97 cancel_freezing(p); 96 cancel_freezing(p);
98 task_unlock(p); 97 task_unlock(p);
99 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
145 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
146 continue; 145 continue;
147 146
148 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
149 continue; 148 continue;
150 149
151 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
@@ -1181,7 +1182,7 @@ static void free_unnecessary_pages(void)
1181 1182
1182 memory_bm_position_reset(&copy_bm); 1183 memory_bm_position_reset(&copy_bm);
1183 1184
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1185 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1186 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1187 struct page *page = pfn_to_page(pfn);
1187 1188
@@ -1500,7 +1501,7 @@ asmlinkage int swsusp_save(void)
1500{ 1501{
1501 unsigned int nr_pages, nr_highmem; 1502 unsigned int nr_pages, nr_highmem;
1502 1503
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1504 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1505
1505 drain_local_pages(NULL); 1506 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1507 nr_pages = count_data_pages();
@@ -1603,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1603 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1604 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1605 * 1606 *
1606 * The @count parameter should contain the number of bytes the caller
1607 * wants to read from the snapshot. It must not be zero.
1608 *
1609 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1610 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1611 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1612 * may be smaller than @count, but this only happens if the read would
1613 * cross a page boundary otherwise.
1614 * 1610 *
1615 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1616 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1618,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1618 * any more. 1614 * any more.
1619 */ 1615 */
1620 1616
1621int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1622{ 1618{
1623 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1624 return 0; 1620 return 0;
@@ -1629,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1629 if (!buffer) 1625 if (!buffer)
1630 return -ENOMEM; 1626 return -ENOMEM;
1631 } 1627 }
1632 if (!handle->offset) { 1628 if (!handle->cur) {
1633 int error; 1629 int error;
1634 1630
1635 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1638,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1638 handle->buffer = buffer; 1634 handle->buffer = buffer;
1639 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1640 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1641 } 1637 } else if (handle->cur <= nr_meta_pages) {
1642 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1643 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1644 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1645 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1646 } else {
1647 struct page *page;
1648 1642
1649 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1650 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1651 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1652 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1653 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1654 */ 1648 */
1655 void *kaddr; 1649 void *kaddr;
1656 1650
1657 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1658 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1659 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1660 handle->buffer = buffer; 1654 handle->buffer = buffer;
1661 } else { 1655 } else {
1662 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1663 }
1664 } 1657 }
1665 handle->prev = handle->cur;
1666 }
1667 handle->buf_offset = handle->cur_offset;
1668 if (handle->cur_offset + count >= PAGE_SIZE) {
1669 count = PAGE_SIZE - handle->cur_offset;
1670 handle->cur_offset = 0;
1671 handle->cur++;
1672 } else {
1673 handle->cur_offset += count;
1674 } 1658 }
1675 handle->offset += count; 1659 handle->cur++;
1676 return count; 1660 return PAGE_SIZE;
1677} 1661}
1678 1662
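With the offset/prev bookkeeping removed, snapshot_read_next() hands back exactly one page (PAGE_SIZE) per successful call, so a consumer loop reduces to the following shape; write_page_out() is a hypothetical sink standing in for the swap or user-space writer:

    struct snapshot_handle snapshot;
    int ret;

    memset(&snapshot, 0, sizeof(snapshot));
    while ((ret = snapshot_read_next(&snapshot)) > 0) {
            /* data_of(snapshot) points at one full page of image data. */
            ret = write_page_out(data_of(snapshot));        /* hypothetical */
            if (ret)
                    break;
    }
    /* ret == 0: whole image consumed; ret < 0: error from the snapshot layer. */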
1679/** 1663/**
@@ -2132,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2132 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2133 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2134 * 2118 *
2135 * The @count parameter should contain the number of bytes the caller
2136 * wants to write to the image. It must not be zero.
2137 *
2138 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2139 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2140 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2141 * may be smaller than @count, but this only happens if the write would
2142 * cross a page boundary otherwise.
2143 * 2122 *
2144 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2145 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2147,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2147 * any more. 2126 * any more.
2148 */ 2127 */
2149 2128
2150int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2151{ 2130{
2152 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2153 int error = 0; 2132 int error = 0;
2154 2133
2155 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2156 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2157 return 0; 2136 return 0;
2158 2137
2159 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2160 if (!buffer) 2141 if (!buffer)
2161 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2162 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2165,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2165 return -ENOMEM; 2146 return -ENOMEM;
2166 2147
2167 handle->buffer = buffer; 2148 handle->buffer = buffer;
2168 } 2149 } else if (handle->cur == 1) {
2169 handle->sync_read = 1; 2150 error = load_header(buffer);
2170 if (handle->prev < handle->cur) { 2151 if (error)
2171 if (handle->prev == 0) { 2152 return error;
2172 error = load_header(buffer);
2173 if (error)
2174 return error;
2175 2153
2176 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2177 if (error) 2155 if (error)
2178 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2179 2162
2180 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2181 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2182 if (error) 2165 if (error)
2183 return error; 2166 return error;
2184 2167
2185 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2186 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2187 if (error) 2170 restore_pblist = NULL;
2188 return error;
2189
2190 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2191 memory_bm_position_reset(&orig_bm);
2192 restore_pblist = NULL;
2193 handle->buffer = get_buffer(&orig_bm, &ca);
2194 handle->sync_read = 0;
2195 if (IS_ERR(handle->buffer))
2196 return PTR_ERR(handle->buffer);
2197 }
2198 } else {
2199 copy_last_highmem_page();
2200 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2201 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2202 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2203 if (handle->buffer != buffer)
2204 handle->sync_read = 0;
2205 } 2175 }
2206 handle->prev = handle->cur;
2207 }
2208 handle->buf_offset = handle->cur_offset;
2209 if (handle->cur_offset + count >= PAGE_SIZE) {
2210 count = PAGE_SIZE - handle->cur_offset;
2211 handle->cur_offset = 0;
2212 handle->cur++;
2213 } else { 2176 } else {
2214 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2215 } 2183 }
2216 handle->offset += count; 2184 handle->cur++;
2217 return count; 2185 return PAGE_SIZE;
2218} 2186}
2219 2187
2220/** 2188/**
@@ -2229,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2229{ 2197{
2230 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2231 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2232 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2233 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2234 free_highmem_data(); 2202 free_highmem_data();
2235 } 2203 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6f10dfc2d3e9..f37cb7dd4402 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,13 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
19#include <linux/io.h>
20#include <linux/kernel.h>
21#include <linux/list.h>
22#include <linux/mm.h>
23#include <linux/slab.h>
24#include <linux/suspend.h>
18 25
19#include "power.h" 26#include "power.h"
20 27
@@ -189,6 +196,7 @@ static int suspend_enter(suspend_state_t state)
189int suspend_devices_and_enter(suspend_state_t state) 196int suspend_devices_and_enter(suspend_state_t state)
190{ 197{
191 int error; 198 int error;
199 gfp_t saved_mask;
192 200
193 if (!suspend_ops) 201 if (!suspend_ops)
194 return -ENOSYS; 202 return -ENOSYS;
@@ -199,6 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state)
199 goto Close; 207 goto Close;
200 } 208 }
201 suspend_console(); 209 suspend_console();
210 saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
202 suspend_test_start(); 211 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND); 212 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) { 213 if (error) {
@@ -215,6 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state)
215 suspend_test_start(); 224 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME); 225 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices"); 226 suspend_test_finish("resume devices");
227 set_gfp_allowed_mask(saved_mask);
218 resume_console(); 228 resume_console();
219 Close: 229 Close:
220 if (suspend_ops->end) 230 if (suspend_ops->end)
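
suspend_devices_and_enter() now saves, clears and later restores the set of allowed allocation flags around the device suspend path (clear_gfp_allowed_mask(GFP_IOFS) / set_gfp_allowed_mask(saved_mask)), so that no I/O- or filesystem-backed allocations happen while devices are quiesced. A small user-space analogue of that save-clear-restore pattern; the mask bits and helper names below are invented for illustration and are not the kernel's.

#include <stdio.h>

/* Illustrative flag bits standing in for GFP_IO / GFP_FS. */
#define MASK_IO 0x1u
#define MASK_FS 0x2u

static unsigned int allowed_mask = MASK_IO | MASK_FS;

/* Clear @mask from the allowed bits and return the previous value. */
static unsigned int clear_allowed_mask(unsigned int mask)
{
    unsigned int saved = allowed_mask;

    allowed_mask &= ~mask;
    return saved;
}

static void set_allowed_mask(unsigned int mask)
{
    allowed_mask = mask;
}

int main(void)
{
    unsigned int saved = clear_allowed_mask(MASK_IO | MASK_FS);

    printf("during suspend: %#x\n", allowed_mask);   /* 0x0 */
    /* ... devices are suspended and resumed here ... */
    set_allowed_mask(saved);
    printf("after resume:   %#x\n", allowed_mask);   /* 0x3 */
    return 0;
}
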
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,11 +23,46 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
29#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
30 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
 35 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
 56 * a file-like way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
31struct swsusp_header { 66struct swsusp_header {
32 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
33 sector_t image; 68 sector_t image;
@@ -144,110 +179,24 @@ int swsusp_swap_in_use(void)
144 */ 179 */
145 180
146static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
147static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
148
149/**
150 * submit - submit BIO request.
151 * @rw: READ or WRITE.
152 * @off physical offset of page.
153 * @page: page we're reading or writing.
154 * @bio_chain: list of pending biod (for async reading)
155 *
156 * Straight from the textbook - allocate and initialize the bio.
157 * If we're reading, make sure the page is marked as dirty.
158 * Then submit it and, if @bio_chain == NULL, wait.
159 */
160static int submit(int rw, pgoff_t page_off, struct page *page,
161 struct bio **bio_chain)
162{
163 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
164 struct bio *bio;
165
166 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
167 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
168 bio->bi_bdev = resume_bdev;
169 bio->bi_end_io = end_swap_bio_read;
170
171 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
172 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
173 page_off);
174 bio_put(bio);
175 return -EFAULT;
176 }
177
178 lock_page(page);
179 bio_get(bio);
180
181 if (bio_chain == NULL) {
182 submit_bio(bio_rw, bio);
183 wait_on_page_locked(page);
184 if (rw == READ)
185 bio_set_pages_dirty(bio);
186 bio_put(bio);
187 } else {
188 if (rw == READ)
189 get_page(page); /* These pages are freed later */
190 bio->bi_private = *bio_chain;
191 *bio_chain = bio;
192 submit_bio(bio_rw, bio);
193 }
194 return 0;
195}
196
197static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
198{
199 return submit(READ, page_off, virt_to_page(addr), bio_chain);
200}
201
202static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
203{
204 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
205}
206
207static int wait_on_bio_chain(struct bio **bio_chain)
208{
209 struct bio *bio;
210 struct bio *next_bio;
211 int ret = 0;
212
213 if (bio_chain == NULL)
214 return 0;
215
216 bio = *bio_chain;
217 if (bio == NULL)
218 return 0;
219 while (bio) {
220 struct page *page;
221
222 next_bio = bio->bi_private;
223 page = bio->bi_io_vec[0].bv_page;
224 wait_on_page_locked(page);
225 if (!PageUptodate(page) || PageError(page))
226 ret = -EIO;
227 put_page(page);
228 bio_put(bio);
229 bio = next_bio;
230 }
231 *bio_chain = NULL;
232 return ret;
233}
234 183
235/* 184/*
236 * Saving part 185 * Saving part
237 */ 186 */
238 187
239static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
240{ 189{
241 int error; 190 int error;
242 191
243 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
244 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
245 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
246 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
247 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
248 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
249 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
250 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
251 swsusp_header, NULL); 200 swsusp_header, NULL);
252 } else { 201 } else {
253 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -259,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
259/** 208/**
260 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
261 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
262 */ 213 */
263 214static int swsusp_swap_check(void)
264static int swsusp_swap_check(void) /* This is called before saving image */
265{ 215{
266 int res; 216 int res;
267 217
268 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
269 &resume_bdev); 219 &hib_resume_bdev);
270 if (res < 0) 220 if (res < 0)
271 return res; 221 return res;
272 222
273 root_swap = res; 223 root_swap = res;
274 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
275 if (res) 225 if (res)
276 return res; 226 return res;
277 227
278 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
279 if (res < 0) 229 if (res < 0)
280 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
281 231
282 return res; 232 return res;
283} 233}
@@ -308,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
308 } else { 258 } else {
309 src = buf; 259 src = buf;
310 } 260 }
311 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
312} 262}
313 263
314/*
315 * The swap map is a data structure used for keeping track of each page
316 * written to a swap partition. It consists of many swap_map_page
317 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
318 * These structures are stored on the swap and linked together with the
319 * help of the .next_swap member.
320 *
321 * The swap map is created during suspend. The swap map pages are
322 * allocated and populated one at a time, so we only need one memory
323 * page to set up the entire structure.
324 *
325 * During resume we also only need to use one swap_map_page structure
326 * at a time.
327 */
328
329#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
330
331struct swap_map_page {
332 sector_t entries[MAP_PAGE_ENTRIES];
333 sector_t next_swap;
334};
335
336/**
337 * The swap_map_handle structure is used for handling swap in
338 * a file-alike way
339 */
340
341struct swap_map_handle {
342 struct swap_map_page *cur;
343 sector_t cur_swap;
344 unsigned int k;
345};
346
347static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
348{ 265{
349 if (handle->cur) 266 if (handle->cur)
@@ -353,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
353 270
354static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
355{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
356 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
357 if (!handle->cur) 283 if (!handle->cur) {
358 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
359 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
360 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
361 release_swap_writer(handle); 289 ret = -ENOSPC;
362 return -ENOSPC; 290 goto err_rel;
363 } 291 }
364 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
365 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
366} 300}
367 301
368static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -379,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
379 return error; 313 return error;
380 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
381 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
382 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
383 if (error) 317 if (error)
384 goto out; 318 goto out;
385 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -405,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
405 return -EINVAL; 339 return -EINVAL;
406} 340}
407 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
408/** 360/**
409 * save_image - save the suspend image data 361 * save_image - save the suspend image data
410 */ 362 */
@@ -430,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
430 bio = NULL; 382 bio = NULL;
431 do_gettimeofday(&start); 383 do_gettimeofday(&start);
432 while (1) { 384 while (1) {
433 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
434 if (ret <= 0) 386 if (ret <= 0)
435 break; 387 break;
436 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -440,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
441 nr_pages++; 393 nr_pages++;
442 } 394 }
443 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
444 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
445 if (!ret) 397 if (!ret)
446 ret = err2; 398 ret = err2;
@@ -482,50 +434,34 @@ int swsusp_write(unsigned int flags)
482 struct swap_map_handle handle; 434 struct swap_map_handle handle;
483 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
484 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
485 int error; 438 int error;
486 439
487 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
488 if (error) { 442 if (error) {
489 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
490 "swapon -a.\n");
491 return error; 444 return error;
492 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
493 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
494 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
495 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
496 if (error >= 0) 454 if (error >= 0)
497 error = -EFAULT; 455 error = -EFAULT;
498 456
499 goto out; 457 goto out_finish;
500 } 458 }
501 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
502 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
503 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
504 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
505 goto out; 463out_finish:
506 } 464 error = swap_writer_finish(&handle, flags, error);
507 error = get_swap_writer(&handle);
508 if (!error) {
509 sector_t start = handle.cur_swap;
510
511 error = swap_write_page(&handle, header, NULL);
512 if (!error)
513 error = save_image(&handle, &snapshot,
514 header->pages - 1);
515
516 if (!error) {
517 flush_swap_writer(&handle);
518 printk(KERN_INFO "PM: S");
519 error = mark_swapfiles(start, flags);
520 printk("|\n");
521 }
522 }
523 if (error)
524 free_all_swap_pages(root_swap);
525
526 release_swap_writer(&handle);
527 out:
528 swsusp_close(FMODE_WRITE);
529 return error; 465 return error;
530} 466}
531 467
@@ -541,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
541 handle->cur = NULL; 477 handle->cur = NULL;
542} 478}
543 479
544static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
545{ 482{
546 int error; 483 int error;
547 484
548 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
549 return -EINVAL; 488 return -EINVAL;
550 489
551 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
552 if (!handle->cur) 491 if (!handle->cur)
553 return -ENOMEM; 492 return -ENOMEM;
554 493
555 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
556 if (error) { 495 if (error) {
557 release_swap_reader(handle); 496 release_swap_reader(handle);
558 return error; 497 return error;
@@ -572,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
572 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
573 if (!offset) 512 if (!offset)
574 return -EFAULT; 513 return -EFAULT;
575 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
576 if (error) 515 if (error)
577 return error; 516 return error;
578 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
579 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
580 handle->k = 0; 519 handle->k = 0;
581 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
582 if (!offset) 521 if (!offset)
583 release_swap_reader(handle); 522 release_swap_reader(handle);
584 else if (!error) 523 else if (!error)
585 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
586 } 525 }
587 return error; 526 return error;
588} 527}
589 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
590/** 536/**
591 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
592 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -614,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
614 bio = NULL; 560 bio = NULL;
615 do_gettimeofday(&start); 561 do_gettimeofday(&start);
616 for ( ; ; ) { 562 for ( ; ; ) {
617 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
618 if (error <= 0) 564 if (error <= 0)
619 break; 565 break;
620 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
621 if (error) 567 if (error)
622 break; 568 break;
623 if (snapshot->sync_read) 569 if (snapshot->sync_read)
624 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
625 if (error) 571 if (error)
626 break; 572 break;
627 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
628 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
629 nr_pages++; 575 nr_pages++;
630 } 576 }
631 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
632 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
633 if (!error) 579 if (!error)
634 error = err2; 580 error = err2;
@@ -656,24 +602,20 @@ int swsusp_read(unsigned int *flags_p)
656 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
657 struct swsusp_info *header; 603 struct swsusp_info *header;
658 604
659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
667 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
668 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
669 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
670 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
671 if (!error) 613 if (!error)
672 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
673 if (!error) 615 if (!error)
674 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
675 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
676 618end:
677 if (!error) 619 if (!error)
678 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
679 else 621 else
@@ -689,11 +631,11 @@ int swsusp_check(void)
689{ 631{
690 int error; 632 int error;
691 633
692 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
693 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
694 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
695 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
696 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
697 swsusp_header, NULL); 639 swsusp_header, NULL);
698 if (error) 640 if (error)
699 goto put; 641 goto put;
@@ -701,7 +643,7 @@ int swsusp_check(void)
701 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
702 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
703 /* Reset swap signature now */ 645 /* Reset swap signature now */
704 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
705 swsusp_header, NULL); 647 swsusp_header, NULL);
706 } else { 648 } else {
707 error = -EINVAL; 649 error = -EINVAL;
@@ -709,11 +651,11 @@ int swsusp_check(void)
709 651
710put: 652put:
711 if (error) 653 if (error)
712 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
713 else 655 else
714 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
715 } else { 657 } else {
716 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
717 } 659 }
718 660
719 if (error) 661 if (error)
@@ -728,12 +670,12 @@ put:
728 670
729void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
730{ 672{
731 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
732 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
733 return; 675 return;
734 } 676 }
735 677
736 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
737} 679}
738 680
739static int swsusp_header_init(void) 681static int swsusp_header_init(void)
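
The reshuffled swap code keeps the swap map layout described in the comment block above: each map page stores MAP_PAGE_ENTRIES sector numbers plus one link to the next map page, so a single memory page is enough to build or walk the whole chain. A standalone sketch of that arithmetic, with sector_t approximated by uint64_t and a hypothetical image size; nothing here touches real swap.

#include <inttypes.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
typedef uint64_t sector_t;

/* One slot is reserved for the link, exactly as in the kernel structure. */
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)

struct swap_map_page {
    sector_t entries[MAP_PAGE_ENTRIES];
    sector_t next_swap;
};

int main(void)
{
    unsigned long image_pages = 100000;    /* hypothetical image size */
    unsigned long map_pages =
        (image_pages + MAP_PAGE_ENTRIES - 1) / MAP_PAGE_ENTRIES;

    printf("entries per map page: %zu\n", MAP_PAGE_ENTRIES);
    printf("map pages for %lu data pages: %lu\n", image_pages, map_pages);
    printf("sizeof(struct swap_map_page) = %zu\n",
           sizeof(struct swap_map_page));
    return 0;
}
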
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,23 +184,39 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
196} 209}
197 210
211static void snapshot_deprecated_ioctl(unsigned int cmd)
212{
213 if (printk_ratelimit())
214 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
215 "be removed soon, update your suspend-to-disk "
216 "utilities\n",
217 __builtin_return_address(0), cmd);
218}
219
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 220static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 221 unsigned long arg)
200{ 222{
@@ -246,8 +268,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 268 data->frozen = 0;
247 break; 269 break;
248 270
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 271 case SNAPSHOT_ATOMIC_SNAPSHOT:
272 snapshot_deprecated_ioctl(cmd);
273 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 274 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 275 error = -EPERM;
253 break; 276 break;
@@ -275,8 +298,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 298 data->ready = 0;
276 break; 299 break;
277 300
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 301 case SNAPSHOT_SET_IMAGE_SIZE:
302 snapshot_deprecated_ioctl(cmd);
303 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 304 image_size = arg;
281 break; 305 break;
282 306
@@ -290,15 +314,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 314 error = put_user(size, (loff_t __user *)arg);
291 break; 315 break;
292 316
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 317 case SNAPSHOT_AVAIL_SWAP:
318 snapshot_deprecated_ioctl(cmd);
319 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 320 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 321 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 322 error = put_user(size, (loff_t __user *)arg);
298 break; 323 break;
299 324
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 325 case SNAPSHOT_GET_SWAP_PAGE:
326 snapshot_deprecated_ioctl(cmd);
327 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 328 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 329 error = -ENODEV;
304 break; 330 break;
@@ -321,6 +347,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 347 break;
322 348
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 349 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
350 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 351 if (!swsusp_swap_in_use()) {
325 /* 352 /*
326 * User space encodes device types as two-byte values, 353 * User space encodes device types as two-byte values,
@@ -362,6 +389,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 389 break;
363 390
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 391 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
392 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 393 error = -EINVAL;
366 394
367 switch (arg) { 395 switch (arg) {
@@ -405,7 +433,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
405 * User space encodes device types as two-byte values, 433 * User space encodes device types as two-byte values,
406 * so we need to recode them 434 * so we need to recode them
407 */ 435 */
408 swdev = old_decode_dev(swap_area.dev); 436 swdev = new_decode_dev(swap_area.dev);
409 if (swdev) { 437 if (swdev) {
410 offset = swap_area.offset; 438 offset = swap_area.offset;
411 data->swap = swap_type_of(swdev, offset, NULL); 439 data->swap = swap_type_of(swdev, offset, NULL);
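
snapshot_read() and snapshot_write() above now advance the snapshot handle only on page boundaries and leave the partial-page copying to simple_read_from_buffer()/simple_write_to_buffer(); pg_offp = *offp & ~PAGE_MASK is the offset within the current page. A standalone sketch of that clamping; chunk_for() is an invented helper for illustration only.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/*
 * Illustrative helper: given a file offset and a requested count, return how
 * many bytes may be copied before the next page boundary, the same clamping
 * the patched snapshot_read()/snapshot_write() perform with pg_offp.
 */
static unsigned long chunk_for(unsigned long offp, unsigned long count)
{
    unsigned long pg_offp = offp & ~PAGE_MASK;  /* offset within the page */
    unsigned long room = PAGE_SIZE - pg_offp;

    return count < room ? count : room;
}

int main(void)
{
    /* On a page boundary the full page is available ... */
    printf("%lu\n", chunk_for(2 * PAGE_SIZE, 8192));        /* 4096 */
    /* ... mid-page only the remainder of that page is. */
    printf("%lu\n", chunk_for(2 * PAGE_SIZE + 100, 8192));  /* 3996 */
    return 0;
}
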
diff --git a/kernel/printk.c b/kernel/printk.c
index 1751c456b71f..444b770c9595 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,8 +33,10 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/kdb.h>
36#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
39#include <linux/syslog.h>
38 40
39#include <asm/uaccess.h> 41#include <asm/uaccess.h>
40 42
@@ -69,8 +71,6 @@ int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 71 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
70}; 72};
71 73
72static int saved_console_loglevel = -1;
73
74/* 74/*
75 * Low level drivers may need that to know if they can schedule in 75 * Low level drivers may need that to know if they can schedule in
76 * their unblank() callback or not. So let's export it. 76 * their unblank() callback or not. So let's export it.
@@ -145,6 +145,7 @@ static char __log_buf[__LOG_BUF_LEN];
145static char *log_buf = __log_buf; 145static char *log_buf = __log_buf;
146static int log_buf_len = __LOG_BUF_LEN; 146static int log_buf_len = __LOG_BUF_LEN;
147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ 147static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
148static int saved_console_loglevel = -1;
148 149
149#ifdef CONFIG_KEXEC 150#ifdef CONFIG_KEXEC
150/* 151/*
@@ -258,38 +259,23 @@ static inline void boot_delay_msec(void)
258} 259}
259#endif 260#endif
260 261
261/* 262int do_syslog(int type, char __user *buf, int len, bool from_file)
262 * Commands to do_syslog:
263 *
264 * 0 -- Close the log. Currently a NOP.
265 * 1 -- Open the log. Currently a NOP.
266 * 2 -- Read from the log.
267 * 3 -- Read all messages remaining in the ring buffer.
268 * 4 -- Read and clear all messages remaining in the ring buffer
269 * 5 -- Clear ring buffer.
270 * 6 -- Disable printk's to console
271 * 7 -- Enable printk's to console
272 * 8 -- Set level of messages printed to console
273 * 9 -- Return number of unread characters in the log buffer
274 * 10 -- Return size of the log buffer
275 */
276int do_syslog(int type, char __user *buf, int len)
277{ 263{
278 unsigned i, j, limit, count; 264 unsigned i, j, limit, count;
279 int do_clear = 0; 265 int do_clear = 0;
280 char c; 266 char c;
281 int error = 0; 267 int error = 0;
282 268
283 error = security_syslog(type); 269 error = security_syslog(type, from_file);
284 if (error) 270 if (error)
285 return error; 271 return error;
286 272
287 switch (type) { 273 switch (type) {
288 case 0: /* Close log */ 274 case SYSLOG_ACTION_CLOSE: /* Close log */
289 break; 275 break;
290 case 1: /* Open log */ 276 case SYSLOG_ACTION_OPEN: /* Open log */
291 break; 277 break;
292 case 2: /* Read from log */ 278 case SYSLOG_ACTION_READ: /* Read from log */
293 error = -EINVAL; 279 error = -EINVAL;
294 if (!buf || len < 0) 280 if (!buf || len < 0)
295 goto out; 281 goto out;
@@ -320,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len)
320 if (!error) 306 if (!error)
321 error = i; 307 error = i;
322 break; 308 break;
323 case 4: /* Read/clear last kernel messages */ 309 /* Read/clear last kernel messages */
310 case SYSLOG_ACTION_READ_CLEAR:
324 do_clear = 1; 311 do_clear = 1;
325 /* FALL THRU */ 312 /* FALL THRU */
326 case 3: /* Read last kernel messages */ 313 /* Read last kernel messages */
314 case SYSLOG_ACTION_READ_ALL:
327 error = -EINVAL; 315 error = -EINVAL;
328 if (!buf || len < 0) 316 if (!buf || len < 0)
329 goto out; 317 goto out;
@@ -376,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len)
376 } 364 }
377 } 365 }
378 break; 366 break;
379 case 5: /* Clear ring buffer */ 367 /* Clear ring buffer */
368 case SYSLOG_ACTION_CLEAR:
380 logged_chars = 0; 369 logged_chars = 0;
381 break; 370 break;
382 case 6: /* Disable logging to console */ 371 /* Disable logging to console */
372 case SYSLOG_ACTION_CONSOLE_OFF:
383 if (saved_console_loglevel == -1) 373 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel; 374 saved_console_loglevel = console_loglevel;
385 console_loglevel = minimum_console_loglevel; 375 console_loglevel = minimum_console_loglevel;
386 break; 376 break;
387 case 7: /* Enable logging to console */ 377 /* Enable logging to console */
378 case SYSLOG_ACTION_CONSOLE_ON:
388 if (saved_console_loglevel != -1) { 379 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel; 380 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1; 381 saved_console_loglevel = -1;
391 } 382 }
392 break; 383 break;
393 case 8: /* Set level of messages printed to console */ 384 /* Set level of messages printed to console */
385 case SYSLOG_ACTION_CONSOLE_LEVEL:
394 error = -EINVAL; 386 error = -EINVAL;
395 if (len < 1 || len > 8) 387 if (len < 1 || len > 8)
396 goto out; 388 goto out;
@@ -401,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len)
401 saved_console_loglevel = -1; 393 saved_console_loglevel = -1;
402 error = 0; 394 error = 0;
403 break; 395 break;
404 case 9: /* Number of chars in the log buffer */ 396 /* Number of chars in the log buffer */
397 case SYSLOG_ACTION_SIZE_UNREAD:
405 error = log_end - log_start; 398 error = log_end - log_start;
406 break; 399 break;
407 case 10: /* Size of the log buffer */ 400 /* Size of the log buffer */
401 case SYSLOG_ACTION_SIZE_BUFFER:
408 error = log_buf_len; 402 error = log_buf_len;
409 break; 403 break;
410 default: 404 default:
@@ -417,9 +411,25 @@ out:
417 411
418SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) 412SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
419{ 413{
420 return do_syslog(type, buf, len); 414 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
421} 415}
422 416
417#ifdef CONFIG_KGDB_KDB
418/* kdb dmesg command needs access to the syslog buffer. do_syslog()
419 * uses locks so it cannot be used during debugging. Just tell kdb
420 * where the start and end of the physical and logical logs are. This
421 * is equivalent to do_syslog(3).
422 */
423void kdb_syslog_data(char *syslog_data[4])
424{
425 syslog_data[0] = log_buf;
426 syslog_data[1] = log_buf + log_buf_len;
427 syslog_data[2] = log_buf + log_end -
428 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
429 syslog_data[3] = log_buf + log_end;
430}
431#endif /* CONFIG_KGDB_KDB */
432
423/* 433/*
424 * Call the console drivers on a range of log_buf 434 * Call the console drivers on a range of log_buf
425 */ 435 */
@@ -593,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...)
593 va_list args; 603 va_list args;
594 int r; 604 int r;
595 605
606#ifdef CONFIG_KGDB_KDB
607 if (unlikely(kdb_trap_printk)) {
608 va_start(args, fmt);
609 r = vkdb_printf(fmt, args);
610 va_end(args);
611 return r;
612 }
613#endif
596 va_start(args, fmt); 614 va_start(args, fmt);
597 r = vprintk(fmt, args); 615 r = vprintk(fmt, args);
598 va_end(args); 616 va_end(args);
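
do_syslog() above switches from bare numeric cases to the SYSLOG_ACTION_* constants in <linux/syslog.h>; the numeric values themselves are unchanged. From user space the same commands are reachable through glibc's klogctl(). A short sketch using the traditional numbers directly (10 for buffer size, 3 for read-all) so it builds without kernel headers; reading the log may require appropriate privileges.

#include <stdio.h>
#include <stdlib.h>
#include <sys/klog.h>

int main(void)
{
    int len = klogctl(10 /* SYSLOG_ACTION_SIZE_BUFFER */, NULL, 0);
    char *buf;

    if (len < 0) {
        perror("klogctl");
        return 1;
    }
    buf = malloc(len + 1);
    if (!buf)
        return 1;
    len = klogctl(3 /* SYSLOG_ACTION_READ_ALL */, buf, len);
    if (len < 0) {
        perror("klogctl");
        free(buf);
        return 1;
    }
    buf[len] = '\0';
    printf("%d bytes of kernel log read\n", len);
    free(buf);
    return 0;
}
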
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
@@ -363,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
363 switch (action) { 365 switch (action) {
364 case CPU_UP_PREPARE: 366 case CPU_UP_PREPARE:
365 case CPU_UP_PREPARE_FROZEN: 367 case CPU_UP_PREPARE_FROZEN:
366 node = cpu_to_node(cpu); 368 node = cpu_to_mem(cpu);
367 per_cpu(cpu_profile_flip, cpu) = 0; 369 per_cpu(cpu_profile_flip, cpu) = 0;
368 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 370 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
369 page = alloc_pages_exact_node(node, 371 page = alloc_pages_exact_node(node,
370 GFP_KERNEL | __GFP_ZERO, 372 GFP_KERNEL | __GFP_ZERO,
371 0); 373 0);
372 if (!page) 374 if (!page)
373 return NOTIFY_BAD; 375 return notifier_from_errno(-ENOMEM);
374 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
375 } 377 }
376 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 378 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -386,7 +388,7 @@ out_free:
386 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
387 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 389 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
388 __free_page(page); 390 __free_page(page);
389 return NOTIFY_BAD; 391 return notifier_from_errno(-ENOMEM);
390 case CPU_ONLINE: 392 case CPU_ONLINE:
391 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
392 if (prof_cpu_mask != NULL) 394 if (prof_cpu_mask != NULL)
@@ -565,7 +567,7 @@ static int create_hash_tables(void)
565 int cpu; 567 int cpu;
566 568
567 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
568 int node = cpu_to_node(cpu); 570 int node = cpu_to_mem(cpu);
569 struct page *page; 571 struct page *page;
570 572
571 page = alloc_pages_exact_node(node, 573 page = alloc_pages_exact_node(node,
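
The profile.c fix above zeroes the vmalloc()ed profile buffer before use, since vmalloc(), unlike the page allocations done with __GFP_ZERO elsewhere in this function, returns uninitialized memory. A trivial user-space analogue of the same class of bug and fix.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    size_t n = 1 << 16;
    unsigned int *hits = malloc(n * sizeof(*hits));   /* uninitialized */

    if (!hits)
        return 1;
    /* Without this, the counters start from garbage, not from zero. */
    memset(hits, 0, n * sizeof(*hits));

    hits[42]++;     /* safe to accumulate now */
    printf("hits[42] = %u\n", hits[42]);
    free(hits);
    return 0;
}
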
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -22,6 +21,7 @@
22#include <linux/pid_namespace.h> 21#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 22#include <linux/syscalls.h>
24#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/regset.h>
25 25
26 26
27/* 27/*
@@ -75,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
75 child->parent = child->real_parent; 75 child->parent = child->real_parent;
76 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
77 77
78 arch_ptrace_untrace(child);
79 if (task_is_traced(child)) 78 if (task_is_traced(child))
80 ptrace_untrace(child); 79 ptrace_untrace(child);
81} 80}
@@ -511,6 +510,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 510 return 0;
512} 511}
513 512
513#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
514
515static const struct user_regset *
516find_regset(const struct user_regset_view *view, unsigned int type)
517{
518 const struct user_regset *regset;
519 int n;
520
521 for (n = 0; n < view->n; ++n) {
522 regset = view->regsets + n;
523 if (regset->core_note_type == type)
524 return regset;
525 }
526
527 return NULL;
528}
529
530static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
531 struct iovec *kiov)
532{
533 const struct user_regset_view *view = task_user_regset_view(task);
534 const struct user_regset *regset = find_regset(view, type);
535 int regset_no;
536
537 if (!regset || (kiov->iov_len % regset->size) != 0)
538 return -EINVAL;
539
540 regset_no = regset - view->regsets;
541 kiov->iov_len = min(kiov->iov_len,
542 (__kernel_size_t) (regset->n * regset->size));
543
544 if (req == PTRACE_GETREGSET)
545 return copy_regset_to_user(task, view, regset_no, 0,
546 kiov->iov_len, kiov->iov_base);
547 else
548 return copy_regset_from_user(task, view, regset_no, 0,
549 kiov->iov_len, kiov->iov_base);
550}
551
552#endif
553
514int ptrace_request(struct task_struct *child, long request, 554int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 555 long addr, long data)
516{ 556{
@@ -554,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
554 ret = ptrace_detach(child, data); 594 ret = ptrace_detach(child, data);
555 break; 595 break;
556 596
597#ifdef CONFIG_BINFMT_ELF_FDPIC
598 case PTRACE_GETFDPIC: {
599 struct mm_struct *mm = get_task_mm(child);
600 unsigned long tmp = 0;
601
602 ret = -ESRCH;
603 if (!mm)
604 break;
605
606 switch (addr) {
607 case PTRACE_GETFDPIC_EXEC:
608 tmp = mm->context.exec_fdpic_loadmap;
609 break;
610 case PTRACE_GETFDPIC_INTERP:
611 tmp = mm->context.interp_fdpic_loadmap;
612 break;
613 default:
614 break;
615 }
616 mmput(mm);
617
618 ret = put_user(tmp, (unsigned long __user *) data);
619 break;
620 }
621#endif
622
557#ifdef PTRACE_SINGLESTEP 623#ifdef PTRACE_SINGLESTEP
558 case PTRACE_SINGLESTEP: 624 case PTRACE_SINGLESTEP:
559#endif 625#endif
@@ -573,6 +639,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 639 return 0;
574 return ptrace_resume(child, request, SIGKILL); 640 return ptrace_resume(child, request, SIGKILL);
575 641
642#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
643 case PTRACE_GETREGSET:
644 case PTRACE_SETREGSET:
645 {
646 struct iovec kiov;
647 struct iovec __user *uiov = (struct iovec __user *) data;
648
649 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
650 return -EFAULT;
651
652 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
653 __get_user(kiov.iov_len, &uiov->iov_len))
654 return -EFAULT;
655
656 ret = ptrace_regset(child, request, addr, &kiov);
657 if (!ret)
658 ret = __put_user(kiov.iov_len, &uiov->iov_len);
659 break;
660 }
661#endif
576 default: 662 default:
577 break; 663 break;
578 } 664 }
@@ -604,10 +690,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
604 struct task_struct *child; 690 struct task_struct *child;
605 long ret; 691 long ret;
606 692
607 /*
608 * This lock_kernel fixes a subtle race with suid exec
609 */
610 lock_kernel();
611 if (request == PTRACE_TRACEME) { 693 if (request == PTRACE_TRACEME) {
612 ret = ptrace_traceme(); 694 ret = ptrace_traceme();
613 if (!ret) 695 if (!ret)
@@ -641,7 +723,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
641 out_put_task_struct: 723 out_put_task_struct:
642 put_task_struct(child); 724 put_task_struct(child);
643 out: 725 out:
644 unlock_kernel();
645 return ret; 726 return ret;
646} 727}
647 728
@@ -711,6 +792,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 792 else
712 ret = ptrace_setsiginfo(child, &siginfo); 793 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 794 break;
795#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
796 case PTRACE_GETREGSET:
797 case PTRACE_SETREGSET:
798 {
799 struct iovec kiov;
800 struct compat_iovec __user *uiov =
801 (struct compat_iovec __user *) datap;
802 compat_uptr_t ptr;
803 compat_size_t len;
804
805 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
806 return -EFAULT;
807
808 if (__get_user(ptr, &uiov->iov_base) ||
809 __get_user(len, &uiov->iov_len))
810 return -EFAULT;
811
812 kiov.iov_base = compat_ptr(ptr);
813 kiov.iov_len = len;
814
815 ret = ptrace_regset(child, request, addr, &kiov);
816 if (!ret)
817 ret = __put_user(kiov.iov_len, &uiov->iov_len);
818 break;
819 }
820#endif
714 821
715 default: 822 default:
716 ret = ptrace_request(child, request, addr, data); 823 ret = ptrace_request(child, request, addr, data);
@@ -725,10 +832,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
725 struct task_struct *child; 832 struct task_struct *child;
726 long ret; 833 long ret;
727 834
728 /*
729 * This lock_kernel fixes a subtle race with suid exec
730 */
731 lock_kernel();
732 if (request == PTRACE_TRACEME) { 835 if (request == PTRACE_TRACEME) {
733 ret = ptrace_traceme(); 836 ret = ptrace_traceme();
734 goto out; 837 goto out;
@@ -758,7 +861,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
758 out_put_task_struct: 861 out_put_task_struct:
759 put_task_struct(child); 862 put_task_struct(child);
760 out: 863 out:
761 unlock_kernel();
762 return ret; 864 return ret;
763} 865}
764#endif /* CONFIG_COMPAT */ 866#endif /* CONFIG_COMPAT */
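
The new PTRACE_GETREGSET/PTRACE_SETREGSET requests take a regset type (an ELF note number such as NT_PRSTATUS) plus a struct iovec describing the caller's buffer, and ptrace_regset() clamps iov_len to the regset size. A user-space sketch of the read side; it assumes x86-64's user_regs_struct and a libc new enough to define PTRACE_GETREGSET.

#include <elf.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        execlp("true", "true", (char *)NULL);
        _exit(127);
    }

    waitpid(pid, NULL, 0);    /* child stops at exec under TRACEME */

    struct user_regs_struct regs;   /* arch-specific; x86-64 assumed */
    struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

    if (ptrace(PTRACE_GETREGSET, pid, (void *)(long)NT_PRSTATUS, &iov) == 0)
        printf("got %zu bytes of NT_PRSTATUS\n", iov.iov_len);
    else
        perror("PTRACE_GETREGSET");

    ptrace(PTRACE_CONT, pid, NULL, NULL);
    waitpid(pid, NULL, 0);
    return 0;
}
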
diff --git a/kernel/range.c b/kernel/range.c
new file mode 100644
index 000000000000..74e2e6114927
--- /dev/null
+++ b/kernel/range.c
@@ -0,0 +1,163 @@
1/*
2 * Range add and subtract
3 */
4#include <linux/module.h>
5#include <linux/init.h>
6#include <linux/sort.h>
7
8#include <linux/range.h>
9
10#ifndef ARRAY_SIZE
11#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
12#endif
13
14int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
15{
16 if (start >= end)
17 return nr_range;
18
19 /* Out of slots: */
20 if (nr_range >= az)
21 return nr_range;
22
23 range[nr_range].start = start;
24 range[nr_range].end = end;
25
26 nr_range++;
27
28 return nr_range;
29}
30
31int add_range_with_merge(struct range *range, int az, int nr_range,
32 u64 start, u64 end)
33{
34 int i;
35
36 if (start >= end)
37 return nr_range;
38
39 /* Try to merge it with old one: */
40 for (i = 0; i < nr_range; i++) {
41 u64 final_start, final_end;
42 u64 common_start, common_end;
43
44 if (!range[i].end)
45 continue;
46
47 common_start = max(range[i].start, start);
48 common_end = min(range[i].end, end);
49 if (common_start > common_end)
50 continue;
51
52 final_start = min(range[i].start, start);
53 final_end = max(range[i].end, end);
54
55 range[i].start = final_start;
56 range[i].end = final_end;
57 return nr_range;
58 }
59
60 /* Need to add it: */
61 return add_range(range, az, nr_range, start, end);
62}
63
64void subtract_range(struct range *range, int az, u64 start, u64 end)
65{
66 int i, j;
67
68 if (start >= end)
69 return;
70
71 for (j = 0; j < az; j++) {
72 if (!range[j].end)
73 continue;
74
75 if (start <= range[j].start && end >= range[j].end) {
76 range[j].start = 0;
77 range[j].end = 0;
78 continue;
79 }
80
81 if (start <= range[j].start && end < range[j].end &&
82 range[j].start < end) {
83 range[j].start = end;
84 continue;
85 }
86
87
88 if (start > range[j].start && end >= range[j].end &&
89 range[j].end > start) {
90 range[j].end = start;
91 continue;
92 }
93
94 if (start > range[j].start && end < range[j].end) {
95 /* Find the new spare: */
96 for (i = 0; i < az; i++) {
97 if (range[i].end == 0)
98 break;
99 }
100 if (i < az) {
101 range[i].end = range[j].end;
102 range[i].start = end;
103 } else {
104 printk(KERN_ERR "ran out of slots in ranges\n");
105 }
106 range[j].end = start;
107 continue;
108 }
109 }
110}
111
112static int cmp_range(const void *x1, const void *x2)
113{
114 const struct range *r1 = x1;
115 const struct range *r2 = x2;
116 s64 start1, start2;
117
118 start1 = r1->start;
119 start2 = r2->start;
120
121 return start1 - start2;
122}
123
124int clean_sort_range(struct range *range, int az)
125{
126 int i, j, k = az - 1, nr_range = 0;
127
128 for (i = 0; i < k; i++) {
129 if (range[i].end)
130 continue;
131 for (j = k; j > i; j--) {
132 if (range[j].end) {
133 k = j;
134 break;
135 }
136 }
137 if (j == i)
138 break;
139 range[i].start = range[k].start;
140 range[i].end = range[k].end;
141 range[k].start = 0;
142 range[k].end = 0;
143 k--;
144 }
145 /* count it */
146 for (i = 0; i < az; i++) {
147 if (!range[i].end) {
148 nr_range = i;
149 break;
150 }
151 }
152
153 /* sort them */
154 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
155
156 return nr_range;
157}
158
159void sort_range(struct range *range, int nr_range)
160{
161 /* sort them */
162 sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
163}
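
kernel/range.c above treats ranges as half-open [start, end) intervals, uses end == 0 to mark a free slot, and lets subtract_range() split a range in two when the subtracted interval lands in the middle. A self-contained exercise of those semantics; subtract_range() below is a trimmed restatement of the patch's logic, written for illustration rather than reuse.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };

static void subtract_range(struct range *r, int az, uint64_t s, uint64_t e)
{
    int i, j;

    for (j = 0; j < az; j++) {
        if (!r[j].end)
            continue;
        if (s <= r[j].start && e >= r[j].end) {
            /* fully covered: drop the range */
            r[j].start = r[j].end = 0;
        } else if (s <= r[j].start && e > r[j].start) {
            /* overlap at the front: clip the head */
            r[j].start = e;
        } else if (e >= r[j].end && s < r[j].end) {
            /* overlap at the back: clip the tail */
            r[j].end = s;
        } else if (s > r[j].start && e < r[j].end) {
            /* hole in the middle: split into two ranges */
            for (i = 0; i < az; i++)
                if (!r[i].end)
                    break;
            if (i < az) {
                r[i].start = e;
                r[i].end = r[j].end;
            }
            r[j].end = s;
        }
    }
}

int main(void)
{
    struct range r[4] = { { 0x1000, 0x9000 } };
    int i;

    subtract_range(r, 4, 0x3000, 0x4000);   /* punch a hole in the middle */
    for (i = 0; i < 4; i++)
        if (r[i].end)
            printf("[%#llx, %#llx)\n",
                   (unsigned long long)r[i].start,
                   (unsigned long long)r[i].end);
    return 0;   /* prints [0x1000, 0x3000) and [0x4000, 0x9000) */
}
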
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,54 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/hardirq.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66#ifdef CONFIG_DEBUG_LOCK_ALLOC
67
68int debug_lockdep_rcu_enabled(void)
69{
70 return rcu_scheduler_active && debug_locks &&
71 current->lockdep_recursion == 0;
72}
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74
75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
77 *
78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation.
82 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */
85int rcu_read_lock_bh_held(void)
86{
87 if (!debug_lockdep_rcu_enabled())
88 return 1;
89 return in_softirq();
90}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92
93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
94
55/* 95/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 97 * grace period has elapsed.
@@ -63,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
63 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
64 complete(&rcu->completion); 104 complete(&rcu->completion);
65} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tiny version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
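
The rcutiny hunks above bracket every on-stack rcu_head with init_rcu_head_on_stack()/destroy_rcu_head_on_stack() so the debug-objects machinery (CONFIG_DEBUG_OBJECTS_RCU_HEAD) knows the callback head lives on the stack rather than in static or heap storage. A minimal sketch of the same calling pattern, using only stock call_rcu()/completion APIs; struct on_stack_wait, on_stack_wait_cb() and wait_for_grace_period() are illustrative names, not kernel symbols:

#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/rcupdate.h>

/* Illustrative only: wait for one grace period using an rcu_head on the stack. */
struct on_stack_wait {
	struct rcu_head head;
	struct completion done;
};

static void on_stack_wait_cb(struct rcu_head *head)
{
	struct on_stack_wait *w = container_of(head, struct on_stack_wait, head);

	complete(&w->done);
}

static void wait_for_grace_period(void)
{
	struct on_stack_wait w;

	init_rcu_head_on_stack(&w.head);	/* announce the on-stack head to debug-objects */
	init_completion(&w.done);
	call_rcu(&w.head, on_stack_wait_cb);	/* fires after a full grace period */
	wait_for_completion(&w.done);		/* head cannot still be pending once we return */
	destroy_rcu_head_on_stack(&w.head);	/* must pair with the _on_stack init */
}

Waiting for the completion before destroy_rcu_head_on_stack() is what makes the stack allocation safe; the same ordering appears in each of the rcu_barrier*() hunks above.
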
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9bb52177af02..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -450,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
450{ 464{
451 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
452 466
467 init_rcu_head_on_stack(&rcu.head);
453 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
454 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
455 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
456} 472}
457 473
458static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -465,6 +481,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 481 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 482 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 483 .cb_barrier = rcu_barrier_bh,
484 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 485 .stats = NULL,
469 .irq_capable = 1, 486 .irq_capable = 1,
470 .name = "rcu_bh" 487 .name = "rcu_bh"
@@ -480,6 +497,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 497 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 498 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 499 .cb_barrier = NULL,
500 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 501 .stats = NULL,
484 .irq_capable = 1, 502 .irq_capable = 1,
485 .name = "rcu_bh_sync" 503 .name = "rcu_bh_sync"
@@ -621,6 +639,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 639 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 640 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 641 .cb_barrier = rcu_barrier_sched,
642 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 643 .stats = NULL,
625 .irq_capable = 1, 644 .irq_capable = 1,
626 .name = "sched" 645 .name = "sched"
@@ -636,6 +655,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 655 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 656 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 657 .cb_barrier = NULL,
658 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 659 .stats = NULL,
640 .name = "sched_sync" 660 .name = "sched_sync"
641}; 661};
@@ -650,12 +670,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 670 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 672 .cb_barrier = NULL,
653 .stats = rcu_expedited_torture_stats, 673 .fqs = rcu_sched_force_quiescent_state,
674 .stats = NULL,
654 .irq_capable = 1, 675 .irq_capable = 1,
655 .name = "sched_expedited" 676 .name = "sched_expedited"
656}; 677};
657 678
658/* 679/*
680 * RCU torture force-quiescent-state kthread. Repeatedly induces
681 * bursts of calls to force_quiescent_state(), increasing the probability
682 * of occurrence of some important types of race conditions.
683 */
684static int
685rcu_torture_fqs(void *arg)
686{
687 unsigned long fqs_resume_time;
688 int fqs_burst_remaining;
689
690 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
691 do {
692 fqs_resume_time = jiffies + fqs_stutter * HZ;
693 while (jiffies - fqs_resume_time > LONG_MAX) {
694 schedule_timeout_interruptible(1);
695 }
696 fqs_burst_remaining = fqs_duration;
697 while (fqs_burst_remaining > 0) {
698 cur_ops->fqs();
699 udelay(fqs_holdoff);
700 fqs_burst_remaining -= fqs_holdoff;
701 }
702 rcu_stutter_wait("rcu_torture_fqs");
703 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
704 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
705 rcutorture_shutdown_absorb("rcu_torture_fqs");
706 while (!kthread_should_stop())
707 schedule_timeout_uninterruptible(1);
708 return 0;
709}
710
711/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 712 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 713 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 714 * after a series of grace periods (the "pipeline").
@@ -745,7 +798,11 @@ static void rcu_torture_timer(unsigned long unused)
745 798
746 idx = cur_ops->readlock(); 799 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 800 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 801 p = rcu_dereference_check(rcu_torture_current,
802 rcu_read_lock_held() ||
803 rcu_read_lock_bh_held() ||
804 rcu_read_lock_sched_held() ||
805 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 806 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 807 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 808 cur_ops->readunlock(idx);
@@ -763,13 +820,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 820 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 821 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 822 }
766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 823 __this_cpu_inc(rcu_torture_count[pipe_count]);
767 completed = cur_ops->completed() - completed; 824 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 825 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 826 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 827 completed = RCU_TORTURE_PIPE_LEN;
771 } 828 }
772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 829 __this_cpu_inc(rcu_torture_batch[completed]);
773 preempt_enable(); 830 preempt_enable();
774 cur_ops->readunlock(idx); 831 cur_ops->readunlock(idx);
775} 832}
@@ -798,11 +855,15 @@ rcu_torture_reader(void *arg)
798 do { 855 do {
799 if (irqreader && cur_ops->irq_capable) { 856 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 857 if (!timer_pending(&t))
801 mod_timer(&t, 1); 858 mod_timer(&t, jiffies + 1);
802 } 859 }
803 idx = cur_ops->readlock(); 860 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 861 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 862 p = rcu_dereference_check(rcu_torture_current,
863 rcu_read_lock_held() ||
864 rcu_read_lock_bh_held() ||
865 rcu_read_lock_sched_held() ||
866 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 867 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 868 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 869 cur_ops->readunlock(idx);
@@ -818,13 +879,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 879 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 880 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 881 }
821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); 882 __this_cpu_inc(rcu_torture_count[pipe_count]);
822 completed = cur_ops->completed() - completed; 883 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 884 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 885 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 886 completed = RCU_TORTURE_PIPE_LEN;
826 } 887 }
827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); 888 __this_cpu_inc(rcu_torture_batch[completed]);
828 preempt_enable(); 889 preempt_enable();
829 cur_ops->readunlock(idx); 890 cur_ops->readunlock(idx);
830 schedule(); 891 schedule();
@@ -1030,10 +1091,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1091 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1092 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1093 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1094 "shuffle_interval=%d stutter=%d irqreader=%d "
1095 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1096 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1097 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1098 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1099}
1038 1100
1039static struct notifier_block rcutorture_nb = { 1101static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1171,12 @@ rcu_torture_cleanup(void)
1109 } 1171 }
1110 stats_task = NULL; 1172 stats_task = NULL;
1111 1173
1174 if (fqs_task) {
1175 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1176 kthread_stop(fqs_task);
1177 }
1178 fqs_task = NULL;
1179
1112 /* Wait for all RCU callbacks to fire. */ 1180 /* Wait for all RCU callbacks to fire. */
1113 1181
1114 if (cur_ops->cb_barrier != NULL) 1182 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1222,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1222 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1223 return -EINVAL;
1156 } 1224 }
1225 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1226 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1227 "fqs_duration, fqs disabled.\n");
1228 fqs_duration = 0;
1229 }
1157 if (cur_ops->init) 1230 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1231 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1232
@@ -1282,6 +1355,19 @@ rcu_torture_init(void)
1282 goto unwind; 1355 goto unwind;
1283 } 1356 }
1284 } 1357 }
1358 if (fqs_duration < 0)
1359 fqs_duration = 0;
1360 if (fqs_duration) {
1361 /* Create the fqs thread */
1362 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1363 "rcu_torture_fqs");
1364 if (IS_ERR(fqs_task)) {
1365 firsterr = PTR_ERR(fqs_task);
1366 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1367 fqs_task = NULL;
1368 goto unwind;
1369 }
1370 }
1285 register_reboot_notifier(&rcutorture_nb); 1371 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1372 mutex_unlock(&fullstop_mutex);
1287 return 0; 1373 return 0;
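
rcu_torture_fqs() above is a standard torture kthread: sleep for fqs_stutter seconds, call cur_ops->fqs() back to back for roughly fqs_duration microseconds with udelay(fqs_holdoff) between calls, and exit only once kthread_stop() is observed. A stripped-down sketch of that create/burst/stop life cycle as a standalone module; burst_thread(), do_burst() and the burst_* parameters are illustrative stand-ins for the rcutorture plumbing, not kernel symbols:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>

static int burst_period = 3;	/* seconds between bursts (cf. fqs_stutter) */
static int burst_len = 100;	/* burst length in microseconds (cf. fqs_duration) */
static int burst_hold = 10;	/* microseconds between calls (cf. fqs_holdoff) */
module_param(burst_period, int, 0444);
module_param(burst_len, int, 0444);
module_param(burst_hold, int, 0444);

static struct task_struct *burst_task;

static void do_burst(void)	/* stand-in for cur_ops->fqs() */
{
}

static int burst_thread(void *unused)
{
	int hold = burst_hold > 0 ? burst_hold : 1;	/* avoid a zero-step loop */
	int remaining;

	do {
		/* Sleep until the next burst is due; kthread_stop() wakes us early. */
		schedule_timeout_interruptible(burst_period * HZ);
		/* Hammer the callback for roughly burst_len microseconds. */
		for (remaining = burst_len; remaining > 0; remaining -= hold) {
			do_burst();
			udelay(hold);
		}
	} while (!kthread_should_stop());
	return 0;
}

static int __init burst_init(void)
{
	burst_task = kthread_run(burst_thread, NULL, "burst_thread");
	return IS_ERR(burst_task) ? PTR_ERR(burst_task) : 0;
}

static void __exit burst_exit(void)
{
	kthread_stop(burst_task);	/* wakes the thread and waits for it to exit */
}

module_init(burst_init);
module_exit(burst_exit);
MODULE_LICENSE("GPL");
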
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -54,8 +54,8 @@
54 54
55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
56 56
57#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
58 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
59 .levelcnt = { \ 59 .levelcnt = { \
60 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
61 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -66,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 67 .gpnum = -300, \
68 .completed = -300, \ 68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
70 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
74 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
76} 77}
77 78
78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -81,8 +82,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 84
84static int rcu_scheduler_active __read_mostly; 85int rcu_scheduler_active __read_mostly;
85 86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
86 87
87/* 88/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
@@ -101,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
101 */ 102 */
102void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
103{ 104{
104 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
105 106
106 rdp = &per_cpu(rcu_sched_data, cpu);
107 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
108 barrier(); 108 barrier();
109 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
110 rcu_preempt_note_context_switch(cpu);
111} 110}
112 111
113void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
114{ 113{
115 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
116 115
117 rdp = &per_cpu(rcu_bh_data, cpu);
118 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
119 barrier(); 117 barrier();
120 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
121} 119}
122 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
123#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
124DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
125 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -157,6 +165,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 165EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 166
159/* 167/*
168 * Force a quiescent state for RCU BH.
169 */
170void rcu_bh_force_quiescent_state(void)
171{
172 force_quiescent_state(&rcu_bh_state, 0);
173}
174EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
175
176/*
177 * Force a quiescent state for RCU-sched.
178 */
179void rcu_sched_force_quiescent_state(void)
180{
181 force_quiescent_state(&rcu_sched_state, 0);
182}
183EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
184
185/*
160 * Does the CPU have callbacks ready to be invoked? 186 * Does the CPU have callbacks ready to be invoked?
161 */ 187 */
162static int 188static int
@@ -424,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
424 450
425#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
426 452
453int rcu_cpu_stall_panicking __read_mostly;
454
427static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
428{ 456{
429 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -439,10 +467,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 467
440 /* Only let one CPU complain about others per time interval. */ 468 /* Only let one CPU complain about others per time interval. */
441 469
442 spin_lock_irqsave(&rnp->lock, flags); 470 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 471 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 472 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 473 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 474 return;
447 } 475 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 476 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
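
Throughout this file the diff converts ->lock, ->onofflock and ->fqslock accesses from spin_lock_*() to raw_spin_lock_*(). The raw_ variants remain true busy-wait spinlocks even where the -rt patch set turns ordinary spinlocks into sleeping locks; in a mainline build the two behave identically. A minimal sketch of declaring and using such a lock; my_raw_lock and bump_counter() are illustrative names, not kernel symbols:

#include <linux/spinlock.h>

/* Illustrative only: a lock that must never become a sleeping lock. */
static DEFINE_RAW_SPINLOCK(my_raw_lock);
static unsigned long protected_counter;

static void bump_counter(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&my_raw_lock, flags);
	protected_counter++;			/* non-sleeping critical section */
	raw_spin_unlock_irqrestore(&my_raw_lock, flags);
}
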
@@ -452,23 +480,30 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 480 * due to CPU offlining.
453 */ 481 */
454 rcu_print_task_stall(rnp); 482 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 483 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 484
457 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
458 486
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
460 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
490 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
492 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 493 if (rnp->qsmask == 0)
463 continue; 494 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 495 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
465 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
466 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
467 } 498 }
468 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
471 502
503 /* If so configured, complain about tasks blocking the grace period. */
504
505 rcu_print_detail_task_stall(rsp);
506
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 507 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 508}
474 509
@@ -477,15 +512,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
477 unsigned long flags; 512 unsigned long flags;
478 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
479 514
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
483 518
484 spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 520 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 521 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 522 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 523 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 524
490 set_need_resched(); /* kick ourselves to get things going. */ 525 set_need_resched(); /* kick ourselves to get things going. */
491} 526}
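
Several hunks in this file also replace signed-difference time tests such as (long)(jiffies - rsp->jiffies_stall) >= 0 with ULONG_CMP_GE()/ULONG_CMP_LT(). Both forms compare free-running unsigned counters in a wraparound-safe way. The sketch below is plain user-space C; the macro definitions are written here to match the intent of the kernel helpers rather than copied from this diff:

#include <limits.h>
#include <stdio.h>

/*
 * Wraparound-safe comparisons for free-running unsigned counters:
 * "a >= b" holds when the forward distance from b to a is less than
 * half the counter space.
 */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long deadline = ULONG_MAX - 5;	/* counter about to wrap */
	unsigned long now = deadline + 10;	/* ten ticks later, wrapped around to 4 */

	printf("naive now >= deadline:       %d\n", now >= deadline);		/* 0: fooled by the wrap */
	printf("ULONG_CMP_GE(now, deadline): %d\n", ULONG_CMP_GE(now, deadline));	/* 1 */
	printf("ULONG_CMP_LT(deadline, now): %d\n", ULONG_CMP_LT(deadline, now));	/* 1 */
	return 0;
}
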
@@ -495,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495 long delta; 530 long delta;
496 struct rcu_node *rnp; 531 struct rcu_node *rnp;
497 532
533 if (rcu_cpu_stall_panicking)
534 return;
498 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
499 rnp = rdp->mynode; 536 rnp = rdp->mynode;
500 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -509,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
509 } 546 }
510} 547}
511 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
512#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
513 565
514static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
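
check_cpu_stall_init() above hooks the panic notifier chain so that rcu_panic() can set rcu_cpu_stall_panicking and keep later stall warnings from scribbling over the panic output. The same chain is available to modules; a hedged sketch of the matching register/unregister pair (quiesce_on_panic() and the module wrapper are illustrative names, not kernel code):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int panicking;	/* checked by later diagnostics so they stay quiet */

static int quiesce_on_panic(struct notifier_block *nb, unsigned long ev, void *ptr)
{
	panicking = 1;
	return NOTIFY_DONE;
}

static struct notifier_block quiesce_nb = {
	.notifier_call = quiesce_on_panic,
};

static int __init quiesce_init(void)
{
	return atomic_notifier_chain_register(&panic_notifier_list, &quiesce_nb);
}

static void __exit quiesce_exit(void)
{
	atomic_notifier_chain_unregister(&panic_notifier_list, &quiesce_nb);
}

module_init(quiesce_init);
module_exit(quiesce_exit);
MODULE_LICENSE("GPL");

Built-in code such as the stall detector never unregisters, which is why the hunk above only needs the __init registration.
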
@@ -519,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
519{ 571{
520} 572}
521 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
522#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
523 579
524/* 580/*
@@ -545,12 +601,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 601 local_irq_save(flags);
546 rnp = rdp->mynode; 602 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 603 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 604 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 605 local_irq_restore(flags);
550 return; 606 return;
551 } 607 }
552 __note_new_gpnum(rsp, rnp, rdp); 608 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 609 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 610}
555 611
556/* 612/*
@@ -609,12 +665,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 665 local_irq_save(flags);
610 rnp = rdp->mynode; 666 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 667 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 668 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 669 local_irq_restore(flags);
614 return; 670 return;
615 } 671 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 672 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 673 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 674}
619 675
620/* 676/*
@@ -659,12 +715,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 715 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 716 struct rcu_node *rnp = rcu_get_root(rsp);
661 717
662 if (!cpu_needs_another_gp(rsp, rdp)) { 718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
719 if (cpu_needs_another_gp(rsp, rdp))
720 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 721 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 722 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 723 return;
666 } 724 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 725 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 726
669 /* 727 /*
670 * Propagate new ->completed value to rcu_node structures 728 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +730,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 730 * of the next grace period to process their callbacks.
673 */ 731 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 732 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 733 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 734 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 735 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 736 }
679 local_irq_restore(flags); 737 local_irq_restore(flags);
680 return; 738 return;
@@ -695,15 +753,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 753 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 754 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 755 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 756 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 757 return;
700 } 758 }
701 759
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 760 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 761
704 762
705 /* Exclude any concurrent CPU-hotplug operations. */ 763 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 764 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 765
708 /* 766 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 767 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +781,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 781 * irqs disabled.
724 */ 782 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 783 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 784 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 785 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 786 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 787 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 788 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 789 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 790 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 791 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 792 }
735 793
736 rnp = rcu_get_root(rsp); 794 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 795 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 796 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 797 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 798 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 799}
742 800
743/* 801/*
@@ -776,14 +834,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 834 if (!(rnp->qsmask & mask)) {
777 835
778 /* Our bit has already been cleared, so done. */ 836 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 837 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 838 return;
781 } 839 }
782 rnp->qsmask &= ~mask; 840 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 841 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 842
785 /* Other bits still set at this level, so done. */ 843 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 844 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 845 return;
788 } 846 }
789 mask = rnp->grpmask; 847 mask = rnp->grpmask;
@@ -793,10 +851,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 851
794 break; 852 break;
795 } 853 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 854 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 855 rnp_c = rnp;
798 rnp = rnp->parent; 856 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 857 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 858 WARN_ON_ONCE(rnp_c->qsmask);
801 } 859 }
802 860
@@ -825,7 +883,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 883 struct rcu_node *rnp;
826 884
827 rnp = rdp->mynode; 885 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 886 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 887 if (lastcomp != rnp->completed) {
830 888
831 /* 889 /*
@@ -837,12 +895,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 895 * race occurred.
838 */ 896 */
839 rdp->passed_quiesc = 0; /* try again later! */ 897 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 898 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 899 return;
842 } 900 }
843 mask = rdp->grpmask; 901 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 902 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 903 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 904 } else {
847 rdp->qs_pending = 0; 905 rdp->qs_pending = 0;
848 906
@@ -906,7 +964,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 964
907 if (rdp->nxtlist == NULL) 965 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 966 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 967 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 968 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 969 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 970 rdp->nxtlist = NULL;
@@ -914,7 +972,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 972 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 973 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 974 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 976}
919 977
920/* 978/*
@@ -925,10 +983,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 983 unsigned long flags;
926 struct rcu_data *rdp; 984 struct rcu_data *rdp;
927 985
928 spin_lock_irqsave(&rsp->onofflock, flags); 986 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 987 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 988 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 990 return;
933 } 991 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +995,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 995 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 997 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 998 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 999}
942 1000
943/* 1001/*
@@ -953,23 +1011,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 1011 struct rcu_node *rnp;
954 1012
955 /* Exclude any attempts to start a new grace period. */ 1013 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 1014 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 1015
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1016 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 1017 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1018 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 1019 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 1020 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 1021 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 1022 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 1023 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1024 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 1025 break;
968 } 1026 }
969 if (rnp == rdp->mynode) 1027 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1028 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 1029 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1030 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 1031 mask = rnp->grpmask;
974 rnp = rnp->parent; 1032 rnp = rnp->parent;
975 } while (rnp != NULL); 1033 } while (rnp != NULL);
@@ -980,12 +1038,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1038 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1039 * held leads to deadlock.
982 */ 1040 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1042 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1043 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1044 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1045 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1046 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1047 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1048 rcu_report_exp_rnp(rsp, rnp);
991 1049
@@ -1103,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1103 */ 1161 */
1104void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1105{ 1163{
1106 if (!rcu_pending(cpu))
1107 return; /* if nothing for RCU to do. */
1108 if (user || 1164 if (user ||
1109 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1110 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1136,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1136 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1137 } 1193 }
1138 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1139 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1140} 1197}
1141 1198
1142#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1144,11 +1201,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1201/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1202 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1203 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1204 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1205 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1206static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1207{
1153 unsigned long bit; 1208 unsigned long bit;
1154 int cpu; 1209 int cpu;
@@ -1158,13 +1213,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1213
1159 rcu_for_each_leaf_node(rsp, rnp) { 1214 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1215 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1216 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1217 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1218 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1219 return;
1165 } 1220 }
1166 if (rnp->qsmask == 0) { 1221 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1222 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1223 continue;
1169 } 1224 }
1170 cpu = rnp->grplo; 1225 cpu = rnp->grplo;
@@ -1173,15 +1228,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1228 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1229 mask |= bit;
1175 } 1230 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1231 if (mask != 0) {
1177 1232
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1233 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1234 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1235 continue;
1181 } 1236 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1237 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1238 }
1184 return 0;
1185} 1239}
1186 1240
1187/* 1241/*
@@ -1191,78 +1245,65 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1245static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1246{
1193 unsigned long flags; 1247 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1248 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1249
1199 if (!rcu_gp_in_progress(rsp)) 1250 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1251 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1252 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1253 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1254 return; /* Someone else is already on the job. */
1204 } 1255 }
1205 if (relaxed && 1256 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1257 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1258 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1260 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if (!rcu_gp_in_progress(rsp)) { 1261 if (!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1262 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1264 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1265 }
1218 spin_unlock(&rnp->lock); 1266 rsp->fqs_active = 1;
1219 switch (signaled) { 1267 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1268 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1269 case RCU_GP_INIT:
1222 1270
1223 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1224 1272
1225 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1226
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1229 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1230 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1282 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1283 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1284 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1285
1253 case RCU_FORCE_QS: 1286 case RCU_FORCE_QS:
1254 1287
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1288 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1289 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1290 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1291
1260 /* Leave state in case more forcing is required. */ 1292 /* Leave state in case more forcing is required. */
1261 1293
1294 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1295 break;
1263 } 1296 }
1264unlock_ret: 1297 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1298 if (rsp->fqs_need_gp) {
1299 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1300 rsp->fqs_need_gp = 0;
1301 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1302 return;
1303 }
1304 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1305unlock_fqs_ret:
1306 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1307}
1267 1308
1268#else /* #ifdef CONFIG_SMP */ 1309#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1331,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1331 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1332 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1333 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1334 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1335 force_quiescent_state(rsp, 1);
1295 1336
1296 /* 1337 /*
@@ -1304,7 +1345,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1345
1305 /* Does this CPU require a not-yet-started grace period? */ 1346 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1347 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1348 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1349 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1350 }
1310 1351
@@ -1335,6 +1376,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1376 * grace-period manipulations above.
1336 */ 1377 */
1337 smp_mb(); /* See above block comment. */ 1378 smp_mb(); /* See above block comment. */
1379
1380 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1381 rcu_needs_cpu_flush();
1338} 1382}
1339 1383
1340static void 1384static void
@@ -1369,7 +1413,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1413 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1414 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1415
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1416 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1417 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1418 }
1375 1419
@@ -1387,7 +1431,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1431 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1432 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1433 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1434 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1435 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1436 local_irq_restore(flags);
1393} 1437}
@@ -1440,11 +1484,13 @@ void synchronize_sched(void)
1440 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1441 return; 1485 return;
1442 1486
1487 init_rcu_head_on_stack(&rcu.head);
1443 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1444 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1445 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1446 /* Wait for it. */ 1491 /* Wait for it. */
1447 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1448} 1494}
1449EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1450 1496
@@ -1464,11 +1510,13 @@ void synchronize_rcu_bh(void)
1464 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1465 return; 1511 return;
1466 1512
1513 init_rcu_head_on_stack(&rcu.head);
1467 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1468 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1469 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1470 /* Wait for it. */ 1517 /* Wait for it. */
1471 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1472} 1520}
1473EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1474 1522
@@ -1489,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1489 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1490 1538
1491 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1492 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1543 * If force_quiescent_state() coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1493 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1494 return 1; 1554 return 1;
1495 } 1555 }
1496 1556
@@ -1520,7 +1580,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1580
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1581 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1582 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1583 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1584 rdp->n_rp_need_fqs++;
1525 return 1; 1585 return 1;
1526 } 1586 }
@@ -1545,10 +1605,9 @@ static int rcu_pending(int cpu)
1545/* 1605/*
1546 * Check to see if any future RCU-related work will need to be done 1606 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1607 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1608 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1609 */
1551int rcu_needs_cpu(int cpu) 1610static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1611{
1553 /* RCU callbacks either ready or pending? */ 1612 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1613 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1615,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1615 rcu_preempt_needs_cpu(cpu);
1557} 1616}
1558 1617
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1618static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1619static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1620static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1703,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1703 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1704
1661 /* Set up local state, ensuring consistent view of global state. */ 1705 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1706 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1707 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1708 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1709 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1713,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1713 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1714#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1715 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1716 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1717}
1674 1718
1675/* 1719/*
@@ -1687,7 +1731,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1731 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1732
1689 /* Set up local state, ensuring consistent view of global state. */ 1733 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1734 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1735 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1736 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1737 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1739,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1739 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1740 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1741 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1742 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1743
1700 /* 1744 /*
1701 * A new grace period might start here. If so, we won't be part 1745 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1747,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1747 */
1704 1748
1705 /* Exclude any attempts to start a new GP on large systems. */ 1749 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1750 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1751
1708 /* Add CPU to rcu_node bitmasks. */ 1752 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1753 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1754 mask = rdp->grpmask;
1711 do { 1755 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1756 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1757 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1758 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1759 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1760 if (rnp == rdp->mynode) {
@@ -1718,11 +1762,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1762 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1763 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1764 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1765 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1766 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1767 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1768
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1769 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1770}
1727 1771
1728static void __cpuinit rcu_online_cpu(int cpu) 1772static void __cpuinit rcu_online_cpu(int cpu)
@@ -1774,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1774} 1818}
1775 1819
1776/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1777 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1778 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1779 */ 1838 */
@@ -1806,11 +1865,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1865 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1866static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1867{
1868 static char *buf[] = { "rcu_node_level_0",
1869 "rcu_node_level_1",
1870 "rcu_node_level_2",
1871 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1872 int cpustride = 1;
1810 int i; 1873 int i;
1811 int j; 1874 int j;
1812 struct rcu_node *rnp; 1875 struct rcu_node *rnp;
1813 1876
1877 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1878
1814 /* Initialize the level-tracking arrays. */ 1879 /* Initialize the level-tracking arrays. */
1815 1880
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1881 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1888,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1888 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1889 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1890 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1891 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1892 lockdep_set_class_and_name(&rnp->lock,
1893 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1894 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1895 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1896 rnp->qsmaskinit = 0;
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,32 +1933,18 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
1878{ 1944{
1879 int i; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1896,8 +1956,9 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1956 * or the scheduler are operational.
1897 */ 1957 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
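
The rcu_init_one() hunk above pairs the new per-level lock-class names with a BUILD_BUG_ON() so that buf[] cannot silently fall out of step with MAX_RCU_LVLS. The following standalone sketch shows the same compile-time-check idiom outside the kernel; ARRAY_SIZE and BUILD_BUG_ON are re-created locally, and MAX_LVLS and the level_name[] strings are made up for illustration.

#include <stdio.h>

#define ARRAY_SIZE(a)      (sizeof(a) / sizeof((a)[0]))
/* Negative-array-size trick: breaks the build when cond is true. */
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

#define MAX_LVLS 4

static const char *level_name[] = {
	"level_0", "level_1", "level_2", "level_3",
};

int main(void)
{
	int i;

	/* Refuses to compile if MAX_LVLS grows past the name table. */
	BUILD_BUG_ON(MAX_LVLS > ARRAY_SIZE(level_name));

	for (i = 0; i < MAX_LVLS; i++)
		printf("lock class %d named \"%s\"\n", i, level_name[i]);
	return 0;
}

In the hunk itself the corresponding check is BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)), which keeps the "rcu_node_level_N" lockdep names in step with the hierarchy depth.
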
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,15 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_report_qs;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cb_ready;
228 long n_rp_gp_completed; 228 unsigned long n_rp_cpu_needs_gp;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_completed;
230 long n_rp_need_fqs; 230 unsigned long n_rp_gp_started;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_fqs;
232 unsigned long n_rp_need_nothing;
232 233
233 int cpu; 234 int cpu;
234}; 235};
@@ -237,25 +238,36 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 238#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 239#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 240#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 241#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 242#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 244#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 245#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 246#endif /* #else #ifdef CONFIG_NO_HZ */
247 247
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 249#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
250#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ 250
251#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ 251#ifdef CONFIG_PROVE_RCU
252#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 252#define RCU_STALL_DELAY_DELTA (5 * HZ)
253 /* to take at least one */ 253#else
254 /* scheduling clock irq */ 254#define RCU_STALL_DELAY_DELTA 0
255 /* before ratting on them. */ 255#endif
256
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
260 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */
263 /* scheduling clock irq */
264 /* before ratting on them. */
256 265
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 267
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270
259/* 271/*
260 * RCU global state, including node hierarchy. This hierarchy is 272 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 273 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +289,19 @@ struct rcu_state {
277 289
278 u8 signaled ____cacheline_internodealigned_in_smp; 290 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 291 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 292 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 293 /* is running. */
294 u8 fqs_need_gp; /* A CPU was prevented from */
295 /* starting a new grace */
296 /* period because */
297 /* force_quiescent_state() */
298 /* was running. */
299 unsigned long gpnum; /* Current gp number. */
300 unsigned long completed; /* # of last completed gp. */
282 301
283 /* End of fields guarded by root rcu_node's lock. */ 302 /* End of fields guarded by root rcu_node's lock. */
284 303
285 spinlock_t onofflock; /* exclude on/offline and */ 304 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 305 /* starting new GP. Also */
287 /* protects the following */ 306 /* protects the following */
288 /* orphan_cbs fields. */ 307 /* orphan_cbs fields. */
@@ -292,10 +311,8 @@ struct rcu_state {
292 /* going offline. */ 311 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 312 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 313 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 314 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 315 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 316 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 317 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 318 unsigned long n_force_qs; /* Number of calls to */
@@ -310,6 +327,7 @@ struct rcu_state {
310 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
311 /* for CPU stalls. */ 328 /* for CPU stalls. */
312#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
313}; 331};
314 332
315/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
@@ -319,8 +337,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 337#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 338 /* GP were moved to root. */
321 339
322#ifdef RCU_TREE_NONCORE
323
324/* 340/*
325 * RCU implementation internal declarations: 341 * RCU implementation internal declarations:
326 */ 342 */
@@ -335,7 +351,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 351DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 352#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 353
338#else /* #ifdef RCU_TREE_NONCORE */ 354#ifndef RCU_TREE_NONCORE
339 355
340/* Forward declarations for rcutree_plugin.h */ 356/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 357static void rcu_bootup_announce(void);
@@ -347,6 +363,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 363 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 364#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 367static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +384,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 384static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 385static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 386static void __init __rcu_init_preempt(void);
387static void rcu_needs_cpu_flush(void);
370 388
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 389#endif /* #ifndef RCU_TREE_NONCORE */
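
The rcutree.h changes above turn the grace-period counters into unsigned long and add ULONG_CMP_GE()/ULONG_CMP_LT() so comparisons stay correct when a counter wraps. The small userspace program below copies the two macros from the hunk and uses made-up counter values to show that the modular comparison still orders a wrapped counter correctly where a plain < does not.

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long old_gp = ULONG_MAX - 1;	/* counter just before wrap */
	unsigned long new_gp = old_gp + 3;	/* wraps around to 1 */

	/* Plain comparison is fooled by the wrap... */
	printf("new_gp < old_gp?         %d\n", new_gp < old_gp);		/* 1 */
	/* ...but the modular comparison still sees new_gp as "later". */
	printf("ULONG_CMP_LT(old, new)?  %d\n", ULONG_CMP_LT(old_gp, new_gp));	/* 1 */
	printf("ULONG_CMP_GE(new, old)?  %d\n", ULONG_CMP_GE(new_gp, old_gp));	/* 1 */
	return 0;
}
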
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -62,17 +101,32 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 101EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 102
64/* 103/*
104 * Force a quiescent state for preemptible RCU.
105 */
106void rcu_force_quiescent_state(void)
107{
108 force_quiescent_state(&rcu_preempt_state, 0);
109}
110EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
111
112/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 113 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
68 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
69 */ 121 */
70static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
71{ 123{
72 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
73 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
74 barrier(); 127 barrier();
75 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
76} 130}
77 131
78/* 132/*
@@ -102,7 +156,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 158 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 161 t->rcu_blocked_node = rnp;
108 162
@@ -123,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 177 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 178 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 179 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 180 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 181 }
128 182
129 /* 183 /*
@@ -135,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
135 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
136 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
137 */ 191 */
138 rcu_preempt_qs(cpu);
139 local_irq_save(flags); 192 local_irq_save(flags);
140 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
141 local_irq_restore(flags); 194 local_irq_restore(flags);
142} 195}
143 196
@@ -180,7 +233,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 233 struct rcu_node *rnp_p;
181 234
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 235 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 236 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 237 return; /* Still need more quiescent states! */
185 } 238 }
186 239
@@ -197,8 +250,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 250
198 /* Report up the rest of the hierarchy. */ 251 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 252 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 253 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 254 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 255 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 256}
204 257
@@ -227,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
227 */ 280 */
228 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
229 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
230 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
231 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
232 } 284 }
233 285
@@ -248,10 +300,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 300 */
249 for (;;) { 301 for (;;) {
250 rnp = t->rcu_blocked_node; 302 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 303 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 304 if (rnp == t->rcu_blocked_node)
253 break; 305 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 306 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 307 }
256 empty = !rcu_preempted_readers(rnp); 308 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 309 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +317,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 317 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 318 */
267 if (empty) 319 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 320 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 321 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 322 rcu_report_unblock_qs_rnp(rnp, flags);
271 323
@@ -295,29 +347,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 349 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING
351 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
352#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 353}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 354EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 355
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 356#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 357
358#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
359
360/*
361 * Dump detailed information for all tasks blocking the current RCU
362 * grace period on the specified rcu_node structure.
363 */
364static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
365{
366 unsigned long flags;
367 struct list_head *lp;
368 int phase;
369 struct task_struct *t;
370
371 if (rcu_preempted_readers(rnp)) {
372 raw_spin_lock_irqsave(&rnp->lock, flags);
373 phase = rnp->gpnum & 0x1;
374 lp = &rnp->blocked_tasks[phase];
375 list_for_each_entry(t, lp, rcu_node_entry)
376 sched_show_task(t);
377 raw_spin_unlock_irqrestore(&rnp->lock, flags);
378 }
379}
380
381/*
382 * Dump detailed information for all tasks blocking the current RCU
383 * grace period.
384 */
385static void rcu_print_detail_task_stall(struct rcu_state *rsp)
386{
387 struct rcu_node *rnp = rcu_get_root(rsp);
388
389 rcu_print_detail_task_stall_rnp(rnp);
390 rcu_for_each_leaf_node(rsp, rnp)
391 rcu_print_detail_task_stall_rnp(rnp);
392}
393
394#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
395
396static void rcu_print_detail_task_stall(struct rcu_state *rsp)
397{
398}
399
400#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
401
303/* 402/*
304 * Scan the current list of tasks blocked within RCU read-side critical 403 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 404 * sections, printing out the tid of each.
306 */ 405 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 406static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 407{
309 unsigned long flags;
310 struct list_head *lp; 408 struct list_head *lp;
311 int phase; 409 int phase;
312 struct task_struct *t; 410 struct task_struct *t;
313 411
314 if (rcu_preempted_readers(rnp)) { 412 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 413 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 414 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 415 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 416 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 417 }
322} 418}
323 419
@@ -388,11 +484,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 484 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 485 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 486 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 487 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 488 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 489 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 490 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 491 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 492 }
397 } 493 }
398 return retval; 494 return retval;
@@ -420,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
420 struct task_struct *t = current; 516 struct task_struct *t = current;
421 517
422 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
423 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
424 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
425 return; 520 return;
426 } 521 }
@@ -462,11 +557,13 @@ void synchronize_rcu(void)
462 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
463 return; 558 return;
464 559
560 init_rcu_head_on_stack(&rcu.head);
465 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
466 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
467 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
468 /* Wait for it. */ 564 /* Wait for it. */
469 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
470} 567}
471EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
472 569
@@ -516,7 +613,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 613 unsigned long flags;
517 unsigned long mask; 614 unsigned long mask;
518 615
519 spin_lock_irqsave(&rnp->lock, flags); 616 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 617 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 618 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 619 break;
@@ -525,12 +622,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 622 break;
526 } 623 }
527 mask = rnp->grpmask; 624 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 625 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 626 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 627 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 628 rnp->expmask &= ~mask;
532 } 629 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 630 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 631}
535 632
536/* 633/*
@@ -545,11 +642,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 642{
546 int must_wait; 643 int must_wait;
547 644
548 spin_lock(&rnp->lock); /* irqs already disabled */ 645 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 646 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 647 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 648 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 649 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 650 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 651 rcu_report_exp_rnp(rsp, rnp);
555} 652}
@@ -594,13 +691,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 691 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 692 synchronize_sched_expedited();
596 693
597 spin_lock_irqsave(&rsp->onofflock, flags); 694 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 695
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 696 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 697 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 698 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 699 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 700 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 701 }
605 702
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 703 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +706,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 706 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 707 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 708
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 709 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 710
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 711 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 712 rnp = rcu_get_root(rsp);
@@ -701,6 +798,7 @@ void exit_rcu(void)
701static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
702{ 799{
703 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
704} 802}
705 803
706/* 804/*
@@ -713,6 +811,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 811EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 812
715/* 813/*
814 * Force a quiescent state for RCU, which, because there is no preemptible
815 * RCU, becomes the same as rcu-sched.
816 */
817void rcu_force_quiescent_state(void)
818{
819 rcu_sched_force_quiescent_state();
820}
821EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
822
823/*
716 * Because preemptable RCU does not exist, we never have to check for 824 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 825 * CPUs being in quiescent states.
718 */ 826 */
@@ -734,7 +842,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 842/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 843static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 844{
737 spin_unlock_irqrestore(&rnp->lock, flags); 845 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 846}
739 847
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 848#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +853,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 853 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 854 * tasks blocked within RCU read-side critical sections.
747 */ 855 */
856static void rcu_print_detail_task_stall(struct rcu_state *rsp)
857{
858}
859
860/*
861 * Because preemptable RCU does not exist, we never have to check for
862 * tasks blocked within RCU read-side critical sections.
863 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 864static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 865{
750} 866}
@@ -884,3 +1000,123 @@ static void __init __rcu_init_preempt(void)
884} 1000}
885 1001
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1002#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1003
1004#if !defined(CONFIG_RCU_FAST_NO_HZ)
1005
1006/*
1007 * Check to see if any future RCU-related work will need to be done
1008 * by the current CPU, even if none need be done immediately, returning
1009 * 1 if so. This function is part of the RCU implementation; it is -not-
1010 * an exported member of the RCU API.
1011 *
1012 * Because we have preemptible RCU, just check whether this CPU needs
1013 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
1014 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1015 */
1016int rcu_needs_cpu(int cpu)
1017{
1018 return rcu_needs_cpu_quick_check(cpu);
1019}
1020
1021/*
1022 * Check to see if we need to continue a callback-flush operation to
1023 * allow the last CPU to enter dyntick-idle mode. Because fast dyntick-idle
1024 * entry is not configured, we never need to.
1025 */
1026static void rcu_needs_cpu_flush(void)
1027{
1028}
1029
1030#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1031
1032#define RCU_NEEDS_CPU_FLUSHES 5
1033static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1034static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1035
1036/*
1037 * Check to see if any future RCU-related work will need to be done
1038 * by the current CPU, even if none need be done immediately, returning
1039 * 1 if so. This function is part of the RCU implementation; it is -not-
1040 * an exported member of the RCU API.
1041 *
1042 * Because we are not supporting preemptible RCU, attempt to accelerate
1043 * any current grace periods so that RCU no longer needs this CPU, but
1044 * only if all other CPUs are already in dynticks-idle mode. This will
1045 * allow the CPU cores to be powered down immediately, as opposed to after
1046 * waiting many milliseconds for grace periods to elapse.
1047 *
1048 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1049 * disabled, we do one pass of force_quiescent_state(), then do a
1050 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1051 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1052 */
1053int rcu_needs_cpu(int cpu)
1054{
1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1058 int thatcpu;
1059
1060 /* Check for being in the holdoff period. */
1061 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
1062 return rcu_needs_cpu_quick_check(cpu);
1063
1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1065 for_each_online_cpu(thatcpu) {
1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1074 return rcu_needs_cpu_quick_check(cpu);
1075 }
1076 }
1077
1078 /* Check and update the rcu_dyntick_drain sequencing. */
1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1080 /* First time through, initialize the counter. */
1081 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1082 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1083 /* We have hit the limit, so time to give up. */
1084 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1085 return rcu_needs_cpu_quick_check(cpu);
1086 }
1087
1088 /* Do one step pushing remaining RCU callbacks through. */
1089 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1090 rcu_sched_qs(cpu);
1091 force_quiescent_state(&rcu_sched_state, 0);
1092 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1093 }
1094 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1095 rcu_bh_qs(cpu);
1096 force_quiescent_state(&rcu_bh_state, 0);
1097 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1098 }
1099
1100 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1101 if (c)
1102 raise_softirq(RCU_SOFTIRQ);
1103 return c;
1104}
1105
1106/*
1107 * Check to see if we need to continue a callback-flush operation to
1108 * allow the last CPU to enter dyntick-idle mode.
1109 */
1110static void rcu_needs_cpu_flush(void)
1111{
1112 int cpu = smp_processor_id();
1113 unsigned long flags;
1114
1115 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1116 return;
1117 local_irq_save(flags);
1118 (void)rcu_needs_cpu(cpu);
1119 local_irq_restore(flags);
1120}
1121
1122#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
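
The CONFIG_RCU_FAST_NO_HZ version of rcu_needs_cpu() above sequences its work with a per-CPU drain counter and a jiffies-based holdoff: a few flush attempts per tick, then back off until the tick advances. The sketch below models only that sequencing in plain C; jiffies, FLUSHES_PER_TICK, work_left() and flush_one() are stand-ins invented for the example, not kernel interfaces.

#include <stdio.h>
#include <stdbool.h>

#define FLUSHES_PER_TICK 5

static unsigned long jiffies;		/* pretend tick counter */
static int drain;			/* flush attempts left this tick */
static unsigned long holdoff = -1UL;	/* tick on which we gave up */
static int pending = 12;		/* pretend callbacks outstanding */

static bool work_left(void) { return pending > 0; }
static void flush_one(void) { if (pending) pending--; }

static bool needs_cpu(void)
{
	if (holdoff == jiffies)			/* already gave up this tick */
		return work_left();
	if (drain <= 0) {
		drain = FLUSHES_PER_TICK;	/* first call this round */
	} else if (--drain <= 0) {
		holdoff = jiffies;		/* hit the limit: hold off */
		return work_left();
	}
	flush_one();				/* one step of pushing work */
	return work_left();
}

int main(void)
{
	int i;

	for (jiffies = 0; jiffies < 2; jiffies++) {
		for (i = 0; i < 8; i++) {
			int need = needs_cpu();

			printf("tick %lu try %d -> needs_cpu=%d pending=%d\n",
			       jiffies, i, need, pending);
		}
	}
	return 0;
}
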
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
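
The rcutree_trace.c hunks switch the affected format specifiers from %ld to %lu to match the counters' new unsigned long type. The snippet below shows what the old specifier would print once a counter enters the upper half of its range; the mismatched printf() is deliberate, since that mismatch is exactly what the change removes.

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long gpnum = ULONG_MAX - 41;	/* arbitrary large counter */

	printf("as %%ld: %ld\n", gpnum);	/* deliberate mismatch: prints -42 */
	printf("as %%lu: %lu\n", gpnum);	/* the actual counter value */
	return 0;
}
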
diff --git a/kernel/relay.c b/kernel/relay.c
index c705a41b4ba3..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
539 "relay_hotcpu_callback: cpu %d buffer " 539 "relay_hotcpu_callback: cpu %d buffer "
540 "creation failed\n", hotcpu); 540 "creation failed\n", hotcpu);
541 mutex_unlock(&relay_channels_mutex); 541 mutex_unlock(&relay_channels_mutex);
542 return NOTIFY_BAD; 542 return notifier_from_errno(-ENOMEM);
543 } 543 }
544 } 544 }
545 mutex_unlock(&relay_channels_mutex); 545 mutex_unlock(&relay_channels_mutex);
@@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
1215/* 1215/*
1216 * subbuf_splice_actor - splice up to one subbuf's worth of data 1216 * subbuf_splice_actor - splice up to one subbuf's worth of data
1217 */ 1217 */
1218static int subbuf_splice_actor(struct file *in, 1218static ssize_t subbuf_splice_actor(struct file *in,
1219 loff_t *ppos, 1219 loff_t *ppos,
1220 struct pipe_inode_info *pipe, 1220 struct pipe_inode_info *pipe,
1221 size_t len, 1221 size_t len,
1222 unsigned int flags, 1222 unsigned int flags,
1223 int *nonpad_ret) 1223 int *nonpad_ret)
1224{ 1224{
1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; 1225 unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
1226 struct rchan_buf *rbuf = in->private_data; 1226 struct rchan_buf *rbuf = in->private_data;
1227 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1227 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1228 uint64_t pos = (uint64_t) *ppos; 1228 uint64_t pos = (uint64_t) *ppos;
@@ -1231,8 +1231,8 @@ static int subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1241,9 +1241,12 @@ static int subbuf_splice_actor(struct file *in,
1241 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
1242 .spd_release = relay_page_release, 1242 .spd_release = relay_page_release,
1243 }; 1243 };
1244 ssize_t ret;
1244 1245
1245 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1246 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1247 1250
1248 /* 1251 /*
1249 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1254,7 +1257,7 @@ static int subbuf_splice_actor(struct file *in,
1254 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1255 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1256 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1257 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1258 1261
1259 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1260 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1288,16 +1291,19 @@ static int subbuf_splice_actor(struct file *in,
1288 } 1291 }
1289 } 1292 }
1290 1293
1294 ret = 0;
1291 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1292 return 0; 1296 goto out;
1293 1297
1294 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1295 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1296 return ret; 1300 goto out;
1297 1301
1298 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1299 ret += padding; 1303 ret += padding;
1300 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1301 return ret; 1307 return ret;
1302} 1308}
1303 1309
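
Once subbuf_splice_actor() calls splice_grow_spd(), every exit path has to reach the matching splice_shrink_spd(), which is why the early returns above become goto out. The generic sketch below shows that single-exit cleanup shape with stand-in grow()/shrink() helpers; it is a pattern illustration, not the relay code itself.

#include <stdio.h>
#include <stdlib.h>

struct state { int *pages; };

static int grow(struct state *s)
{
	s->pages = calloc(16, sizeof(*s->pages));
	return s->pages ? 0 : -1;
}

static void shrink(struct state *s)
{
	free(s->pages);
	s->pages = NULL;
}

static int actor(struct state *s, int n)
{
	int ret;

	if (grow(s))
		return -1;		/* nothing allocated yet */

	ret = 0;
	if (n == 0)
		goto out;		/* was: return 0 */

	ret = n;			/* stand-in for the real work */
	if (ret < 0)
		goto out;		/* was: return ret */

	ret += 1;			/* success-only post-processing */
out:
	shrink(s);			/* released on every path */
	return ret;
}

int main(void)
{
	struct state s = { 0 };

	printf("%d %d\n", actor(&s, 0), actor(&s, 3));	/* prints: 0 4 */
	return 0;
}
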
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/sched.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <linux/device.h> 20#include <linux/device.h>
20#include <linux/pfn.h> 21#include <linux/pfn.h>
@@ -188,20 +189,65 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 189 return -EINVAL;
189} 190}
190 191
192static void __release_child_resources(struct resource *r)
193{
194 struct resource *tmp, *p;
195 resource_size_t size;
196
197 p = r->child;
198 r->child = NULL;
199 while (p) {
200 tmp = p;
201 p = p->sibling;
202
203 tmp->parent = NULL;
204 tmp->sibling = NULL;
205 __release_child_resources(tmp);
206
207 printk(KERN_DEBUG "release child resource %pR\n", tmp);
208 /* need to restore size, and keep flags */
209 size = resource_size(tmp);
210 tmp->start = 0;
211 tmp->end = size - 1;
212 }
213}
214
215void release_child_resources(struct resource *r)
216{
217 write_lock(&resource_lock);
218 __release_child_resources(r);
219 write_unlock(&resource_lock);
220}
221
191/** 222/**
192 * request_resource - request and reserve an I/O or memory resource 223 * request_resource_conflict - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 224 * @root: root resource descriptor
194 * @new: resource descriptor desired by caller 225 * @new: resource descriptor desired by caller
195 * 226 *
196 * Returns 0 for success, negative error code on error. 227 * Returns 0 for success, conflict resource on error.
197 */ 228 */
198int request_resource(struct resource *root, struct resource *new) 229struct resource *request_resource_conflict(struct resource *root, struct resource *new)
199{ 230{
200 struct resource *conflict; 231 struct resource *conflict;
201 232
202 write_lock(&resource_lock); 233 write_lock(&resource_lock);
203 conflict = __request_resource(root, new); 234 conflict = __request_resource(root, new);
204 write_unlock(&resource_lock); 235 write_unlock(&resource_lock);
236 return conflict;
237}
238
239/**
240 * request_resource - request and reserve an I/O or memory resource
241 * @root: root resource descriptor
242 * @new: resource descriptor desired by caller
243 *
244 * Returns 0 for success, negative error code on error.
245 */
246int request_resource(struct resource *root, struct resource *new)
247{
248 struct resource *conflict;
249
250 conflict = request_resource_conflict(root, new);
205 return conflict ? -EBUSY : 0; 251 return conflict ? -EBUSY : 0;
206} 252}
207 253
@@ -274,7 +320,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *)) 320 void *arg, int (*func)(unsigned long, unsigned long, void *))
275{ 321{
276 struct resource res; 322 struct resource res;
277 unsigned long pfn, len; 323 unsigned long pfn, end_pfn;
278 u64 orig_end; 324 u64 orig_end;
279 int ret = -1; 325 int ret = -1;
280 326
@@ -284,9 +330,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
284 orig_end = res.end; 330 orig_end = res.end;
285 while ((res.start < res.end) && 331 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) { 332 (find_next_system_ram(&res, "System RAM") >= 0)) {
287 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 333 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 334 end_pfn = (res.end + 1) >> PAGE_SHIFT;
289 ret = (*func)(pfn, len, arg); 335 if (end_pfn > pfn)
336 ret = (*func)(pfn, end_pfn - pfn, arg);
290 if (ret) 337 if (ret)
291 break; 338 break;
292 res.start = res.end + 1; 339 res.start = res.end + 1;
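
The walk_system_ram_range() hunk rounds the region start up to a whole page and derives an exclusive end_pfn by rounding down, so partial pages at either edge are no longer passed to the callback. The standalone example below reproduces that arithmetic with an arbitrary, unaligned region and prints the old length-based result for comparison; PAGE_SHIFT and the addresses are chosen only for illustration.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long long start = 0x1800;	/* region not page aligned */
	unsigned long long end   = 0x57ff;	/* inclusive last byte */

	unsigned long pfn     = (start + PAGE_SIZE - 1) >> PAGE_SHIFT;	/* 2 */
	unsigned long end_pfn = (end + 1) >> PAGE_SHIFT;		/* 5 */

	if (end_pfn > pfn)
		printf("new: whole pages pfn %lu..%lu (%lu pages)\n",
		       pfn, end_pfn - 1, end_pfn - pfn);

	/* Old computation for comparison: starts inside the partial page. */
	printf("old: pfn %llu len %llu\n",
	       start >> PAGE_SHIFT, (end + 1 - start) >> PAGE_SHIFT);
	return 0;
}
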
@@ -297,14 +344,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 344
298#endif 345#endif
299 346
347static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
348{
349 return 1;
350}
351/*
352 * This generic page_is_ram() returns true if specified address is
353 * registered as "System RAM" in iomem_resource list.
354 */
355int __weak page_is_ram(unsigned long pfn)
356{
357 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
358}
359
300/* 360/*
301 * Find empty slot in the resource tree given range and alignment. 361 * Find empty slot in the resource tree given range and alignment.
302 */ 362 */
303static int find_resource(struct resource *root, struct resource *new, 363static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 364 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 365 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 366 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 367 const struct resource *,
368 resource_size_t,
369 resource_size_t),
308 void *alignf_data) 370 void *alignf_data)
309{ 371{
310 struct resource *this = root->child; 372 struct resource *this = root->child;
@@ -330,7 +392,7 @@ static int find_resource(struct resource *root, struct resource *new,
330 tmp.end = max; 392 tmp.end = max;
331 tmp.start = ALIGN(tmp.start, align); 393 tmp.start = ALIGN(tmp.start, align);
332 if (alignf) 394 if (alignf)
333 alignf(alignf_data, &tmp, size, align); 395 tmp.start = alignf(alignf_data, &tmp, size, align);
334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { 396 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
335 new->start = tmp.start; 397 new->start = tmp.start;
336 new->end = tmp.start + size - 1; 398 new->end = tmp.start + size - 1;
@@ -358,8 +420,10 @@ static int find_resource(struct resource *root, struct resource *new,
358int allocate_resource(struct resource *root, struct resource *new, 420int allocate_resource(struct resource *root, struct resource *new,
359 resource_size_t size, resource_size_t min, 421 resource_size_t size, resource_size_t min,
360 resource_size_t max, resource_size_t align, 422 resource_size_t max, resource_size_t align,
361 void (*alignf)(void *, struct resource *, 423 resource_size_t (*alignf)(void *,
362 resource_size_t, resource_size_t), 424 const struct resource *,
425 resource_size_t,
426 resource_size_t),
363 void *alignf_data) 427 void *alignf_data)
364{ 428{
365 int err; 429 int err;
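With the new prototype the alignment callback no longer adjusts the candidate window in place: it returns the start address to try, and find_resource() assigns that to tmp.start. A hedged sketch of a callback written against the new signature (the name and the 64 KiB policy are invented for illustration):

#include <linux/ioport.h>
#include <linux/kernel.h>

/* Matches the new alignf prototype: return the start the allocator should try. */
static resource_size_t align_to_64k(void *data, const struct resource *avail,
				    resource_size_t size, resource_size_t align)
{
	return ALIGN(avail->start, 0x10000);
}

It would be passed as the alignf argument of allocate_resource(), with alignf_data carrying any per-caller state.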
@@ -426,25 +490,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
426} 490}
427 491
428/** 492/**
429 * insert_resource - Inserts a resource in the resource tree 493 * insert_resource_conflict - Inserts resource in the resource tree
430 * @parent: parent of the new resource 494 * @parent: parent of the new resource
431 * @new: new resource to insert 495 * @new: new resource to insert
432 * 496 *
433 * Returns 0 on success, -EBUSY if the resource can't be inserted. 497 * Returns 0 on success, conflict resource if the resource can't be inserted.
434 * 498 *
435 * This function is equivalent to request_resource when no conflict 499 * This function is equivalent to request_resource_conflict when no conflict
436 * happens. If a conflict happens, and the conflicting resources 500 * happens. If a conflict happens, and the conflicting resources
437 * entirely fit within the range of the new resource, then the new 501 * entirely fit within the range of the new resource, then the new
438 * resource is inserted and the conflicting resources become children of 502 * resource is inserted and the conflicting resources become children of
439 * the new resource. 503 * the new resource.
440 */ 504 */
441int insert_resource(struct resource *parent, struct resource *new) 505struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
442{ 506{
443 struct resource *conflict; 507 struct resource *conflict;
444 508
445 write_lock(&resource_lock); 509 write_lock(&resource_lock);
446 conflict = __insert_resource(parent, new); 510 conflict = __insert_resource(parent, new);
447 write_unlock(&resource_lock); 511 write_unlock(&resource_lock);
512 return conflict;
513}
514
515/**
516 * insert_resource - Inserts a resource in the resource tree
517 * @parent: parent of the new resource
518 * @new: new resource to insert
519 *
520 * Returns 0 on success, -EBUSY if the resource can't be inserted.
521 */
522int insert_resource(struct resource *parent, struct resource *new)
523{
524 struct resource *conflict;
525
526 conflict = insert_resource_conflict(parent, new);
448 return conflict ? -EBUSY : 0; 527 return conflict ? -EBUSY : 0;
449} 528}
450 529
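insert_resource() keeps its old 0 / -EBUSY contract, while the new insert_resource_conflict() additionally hands back the resource it collided with. A minimal sketch of a caller that wants to report the conflict (the wrapper name and message are illustrative):

#include <linux/ioport.h>
#include <linux/kernel.h>

static int try_insert(struct resource *parent, struct resource *new)
{
	struct resource *conflict = insert_resource_conflict(parent, new);

	if (!conflict)
		return 0;

	printk(KERN_WARNING "%s: conflicts with %s [0x%llx-0x%llx]\n",
	       new->name, conflict->name,
	       (unsigned long long)conflict->start,
	       (unsigned long long)conflict->end);
	return -EBUSY;
}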
@@ -603,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
603 * release_region releases a matching busy region. 682 * release_region releases a matching busy region.
604 */ 683 */
605 684
685static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
686
606/** 687/**
607 * __request_region - create a new busy resource region 688 * __request_region - create a new busy resource region
608 * @parent: parent resource descriptor 689 * @parent: parent resource descriptor
@@ -615,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
615 resource_size_t start, resource_size_t n, 696 resource_size_t start, resource_size_t n,
616 const char *name, int flags) 697 const char *name, int flags)
617{ 698{
699 DECLARE_WAITQUEUE(wait, current);
618 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 700 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
619 701
620 if (!res) 702 if (!res)
@@ -639,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
639 if (!(conflict->flags & IORESOURCE_BUSY)) 721 if (!(conflict->flags & IORESOURCE_BUSY))
640 continue; 722 continue;
641 } 723 }
642 724 if (conflict->flags & flags & IORESOURCE_MUXED) {
725 add_wait_queue(&muxed_resource_wait, &wait);
726 write_unlock(&resource_lock);
727 set_current_state(TASK_UNINTERRUPTIBLE);
728 schedule();
729 remove_wait_queue(&muxed_resource_wait, &wait);
730 write_lock(&resource_lock);
731 continue;
732 }
643 /* Uhhuh, that didn't work out.. */ 733 /* Uhhuh, that didn't work out.. */
644 kfree(res); 734 kfree(res);
645 res = NULL; 735 res = NULL;
@@ -713,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
713 break; 803 break;
714 *p = res->sibling; 804 *p = res->sibling;
715 write_unlock(&resource_lock); 805 write_unlock(&resource_lock);
806 if (res->flags & IORESOURCE_MUXED)
807 wake_up(&muxed_resource_wait);
716 kfree(res); 808 kfree(res);
717 return; 809 return;
718 } 810 }
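The IORESOURCE_MUXED handling turns a busy conflict into an uninterruptible sleep on muxed_resource_wait rather than a failure, and __release_region() wakes the queue whenever a muxed region is dropped, so two drivers can time-share the same port window as long as both mark their requests muxed. A rough sketch of that usage from a driver's point of view (the driver name is invented; kernel context assumed):

#include <linux/ioport.h>

static int poke_shared_ports(resource_size_t base, resource_size_t len)
{
	struct resource *res;

	/* Sleeps until any other IORESOURCE_MUXED holder of an
	 * overlapping range calls __release_region(). */
	res = __request_region(&ioport_resource, base, len,
			       "example-driver", IORESOURCE_MUXED);
	if (!res)
		return -EBUSY;

	/* ... access the multiplexed ports ... */

	__release_region(&ioport_resource, base, len);	/* wakes other waiters */
	return 0;
}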
diff --git a/kernel/sched.c b/kernel/sched.c
index 4508fe7048be..f52a8801b7a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -233,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 234 */
234static DEFINE_MUTEX(sched_domains_mutex); 235static DEFINE_MUTEX(sched_domains_mutex);
235 236
236#ifdef CONFIG_GROUP_SCHED 237#ifdef CONFIG_CGROUP_SCHED
237 238
238#include <linux/cgroup.h> 239#include <linux/cgroup.h>
239 240
@@ -243,13 +244,7 @@ static LIST_HEAD(task_groups);
243 244
244/* task group related information */ 245/* task group related information */
245struct task_group { 246struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 247 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 248
254#ifdef CONFIG_FAIR_GROUP_SCHED 249#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 250 /* schedulable entities of this group on each cpu */
@@ -274,35 +269,7 @@ struct task_group {
274 struct list_head children; 269 struct list_head children;
275}; 270};
276 271
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 272#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 273
307/* task_group_lock serializes add/remove of task groups and also changes to 274/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 275 * a task group's cpu shares.
@@ -318,11 +285,7 @@ static int root_task_group_empty(void)
318} 285}
319#endif 286#endif
320 287
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 289
327/* 290/*
328 * A weight of 0 or 1 can cause arithmetics problems. 291 * A weight of 0 or 1 can cause arithmetics problems.
@@ -343,47 +306,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
343 */ 306 */
344struct task_group init_task_group; 307struct task_group init_task_group;
345 308
346/* return group to which a task belongs */ 309#endif /* CONFIG_CGROUP_SCHED */
347static inline struct task_group *task_group(struct task_struct *p)
348{
349 struct task_group *tg;
350
351#ifdef CONFIG_USER_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css);
358#else
359 tg = &init_task_group;
360#endif
361 return tg;
362}
363
364/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
366{
367#ifdef CONFIG_FAIR_GROUP_SCHED
368 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
369 p->se.parent = task_group(p)->se[cpu];
370#endif
371
372#ifdef CONFIG_RT_GROUP_SCHED
373 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
374 p->rt.parent = task_group(p)->rt_se[cpu];
375#endif
376}
377
378#else
379
380static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
381static inline struct task_group *task_group(struct task_struct *p)
382{
383 return NULL;
384}
385
386#endif /* CONFIG_GROUP_SCHED */
387 310
388/* CFS-related fields in a runqueue */ 311/* CFS-related fields in a runqueue */
389struct cfs_rq { 312struct cfs_rq {
@@ -478,7 +401,6 @@ struct rt_rq {
478 struct rq *rq; 401 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 402 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 403 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 404#endif
483}; 405};
484 406
@@ -535,8 +457,11 @@ struct rq {
535 #define CPU_LOAD_IDX_MAX 5 457 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 458 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ 459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp;
538 unsigned char in_nohz_recently; 461 unsigned char in_nohz_recently;
539#endif 462#endif
463 unsigned int skip_clock_update;
464
540 /* capture load from *all* tasks on this cpu: */ 465 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 466 struct load_weight load;
542 unsigned long nr_load_updates; 467 unsigned long nr_load_updates;
@@ -573,20 +498,20 @@ struct rq {
573 struct root_domain *rd; 498 struct root_domain *rd;
574 struct sched_domain *sd; 499 struct sched_domain *sd;
575 500
501 unsigned long cpu_power;
502
576 unsigned char idle_at_tick; 503 unsigned char idle_at_tick;
577 /* For active balancing */ 504 /* For active balancing */
578 int post_schedule; 505 int post_schedule;
579 int active_balance; 506 int active_balance;
580 int push_cpu; 507 int push_cpu;
508 struct cpu_stop_work active_balance_work;
581 /* cpu of this runqueue: */ 509 /* cpu of this runqueue: */
582 int cpu; 510 int cpu;
583 int online; 511 int online;
584 512
585 unsigned long avg_load_per_task; 513 unsigned long avg_load_per_task;
586 514
587 struct task_struct *migration_thread;
588 struct list_head migration_queue;
589
590 u64 rt_avg; 515 u64 rt_avg;
591 u64 age_stamp; 516 u64 age_stamp;
592 u64 idle_stamp; 517 u64 idle_stamp;
@@ -634,6 +559,13 @@ static inline
634void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 559void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
635{ 560{
636 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 561 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
562
563 /*
564 * A queue event has occurred, and we're going to schedule. In
565 * this case, we can save a useless back to back clock update.
566 */
567 if (test_tsk_need_resched(p))
568 rq->skip_clock_update = 1;
637} 569}
638 570
639static inline int cpu_of(struct rq *rq) 571static inline int cpu_of(struct rq *rq)
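The skip_clock_update flag set here pairs with the update_rq_clock() change later in this file: when the wakeup has already left the current task marked for rescheduling, refreshing rq->clock on the queueing path and again on the imminent reschedule would be back-to-back duplicate work. Example sequence: a wakeup preempts the running task, check_preempt_curr() sees TIF_NEED_RESCHED already set and sets skip_clock_update, the next update_rq_clock() call leaves rq->clock untouched, and the following schedule() performs the single real update (the place where the flag is cleared again is outside the hunks shown here).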
@@ -645,6 +577,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 577#endif
646} 578}
647 579
580#define rcu_dereference_check_sched_domain(p) \
581 rcu_dereference_check((p), \
582 rcu_read_lock_sched_held() || \
583 lockdep_is_held(&sched_domains_mutex))
584
648/* 585/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 586 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 587 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +590,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 590 * preempt-disabled sections.
654 */ 591 */
655#define for_each_domain(cpu, __sd) \ 592#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 593 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 594
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 595#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 596#define this_rq() (&__get_cpu_var(runqueues))
@@ -661,9 +598,53 @@ static inline int cpu_of(struct rq *rq)
661#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 598#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
662#define raw_rq() (&__raw_get_cpu_var(runqueues)) 599#define raw_rq() (&__raw_get_cpu_var(runqueues))
663 600
601#ifdef CONFIG_CGROUP_SCHED
602
603/*
604 * Return the group to which this task belongs.
605 *
606 * We use task_subsys_state_check() and extend the RCU verification
607 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
608 * holds that lock for each task it moves into the cgroup. Therefore
609 * by holding that lock, we pin the task to the current cgroup.
610 */
611static inline struct task_group *task_group(struct task_struct *p)
612{
613 struct cgroup_subsys_state *css;
614
615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
616 lockdep_is_held(&task_rq(p)->lock));
617 return container_of(css, struct task_group, css);
618}
619
620/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
621static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
622{
623#ifdef CONFIG_FAIR_GROUP_SCHED
624 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
625 p->se.parent = task_group(p)->se[cpu];
626#endif
627
628#ifdef CONFIG_RT_GROUP_SCHED
629 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
630 p->rt.parent = task_group(p)->rt_se[cpu];
631#endif
632}
633
634#else /* CONFIG_CGROUP_SCHED */
635
636static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
637static inline struct task_group *task_group(struct task_struct *p)
638{
639 return NULL;
640}
641
642#endif /* CONFIG_CGROUP_SCHED */
643
664inline void update_rq_clock(struct rq *rq) 644inline void update_rq_clock(struct rq *rq)
665{ 645{
666 rq->clock = sched_clock_cpu(cpu_of(rq)); 646 if (!rq->skip_clock_update)
647 rq->clock = sched_clock_cpu(cpu_of(rq));
667} 648}
668 649
669/* 650/*
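rcu_dereference_check_sched_domain() and the new task_group() both use the same idiom: an RCU-protected pointer that may legitimately be read either inside an RCU read-side critical section or while holding a particular lock, with that extra condition handed to the RCU/lockdep checker so PROVE_RCU stays quiet for the lock-holding readers. A generic sketch of the idiom outside the scheduler (all names are illustrative):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct foo {
	int val;
};

static struct foo *global_foo;
static DEFINE_SPINLOCK(foo_lock);

/* Caller must be in an RCU read-side section or hold foo_lock. */
static int read_foo_val(void)
{
	struct foo *f;

	f = rcu_dereference_check(global_foo,
				  rcu_read_lock_held() ||
				  lockdep_is_held(&foo_lock));
	return f ? f->val : -1;
}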
@@ -941,14 +922,25 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 922#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 923
943/* 924/*
925 * Check whether the task is waking; we use this to synchronize ->cpus_allowed
926 * against ttwu().
927 */
928static inline int task_is_waking(struct task_struct *p)
929{
930 return unlikely(p->state == TASK_WAKING);
931}
932
933/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 934 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called with interrupts disabled. 935
946 */ 936 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 937static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 938 __acquires(rq->lock)
949{ 939{
940 struct rq *rq;
941
950 for (;;) { 942 for (;;) {
951 struct rq *rq = task_rq(p); 943 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 944 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 945 if (likely(rq == task_rq(p)))
954 return rq; 946 return rq;
@@ -976,14 +968,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
976 } 968 }
977} 969}
978 970
979void task_rq_unlock_wait(struct task_struct *p)
980{
981 struct rq *rq = task_rq(p);
982
983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
984 raw_spin_unlock_wait(&rq->lock);
985}
986
987static void __task_rq_unlock(struct rq *rq) 971static void __task_rq_unlock(struct rq *rq)
988 __releases(rq->lock) 972 __releases(rq->lock)
989{ 973{
@@ -1247,6 +1231,17 @@ void wake_up_idle_cpu(int cpu)
1247 if (!tsk_is_polling(rq->idle)) 1231 if (!tsk_is_polling(rq->idle))
1248 smp_send_reschedule(cpu); 1232 smp_send_reschedule(cpu);
1249} 1233}
1234
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1250#endif /* CONFIG_NO_HZ */ 1245#endif /* CONFIG_NO_HZ */
1251 1246
1252static u64 sched_avg_period(void) 1247static u64 sched_avg_period(void)
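nohz_ratelimit(), added just above, returns true when less than half a tick has elapsed since the previous call on that CPU (and refreshes nohz_stamp each time), which the nohz path can use to decline stopping the tick for a CPU that keeps waking straight back up. Worked example: with HZ=1000, (NSEC_PER_SEC / HZ) >> 1 is 500,000 ns, so a CPU checked again 0.3 ms after the previous check is rate-limited, while one checked 2 ms later is not.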
@@ -1259,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
1259 s64 period = sched_avg_period(); 1254 s64 period = sched_avg_period();
1260 1255
1261 while ((s64)(rq->clock - rq->age_stamp) > period) { 1256 while ((s64)(rq->clock - rq->age_stamp) > period) {
1257 /*
1258 * Inline assembly required to prevent the compiler
1259 * optimising this loop into a divmod call.
1260 * See __iter_div_u64_rem() for another example of this.
1261 */
1262 asm("" : "+rm" (rq->age_stamp));
1262 rq->age_stamp += period; 1263 rq->age_stamp += period;
1263 rq->rt_avg /= 2; 1264 rq->rt_avg /= 2;
1264 } 1265 }
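The empty asm with a "+rm" constraint makes rq->age_stamp look as if it were modified by something the compiler cannot see, so GCC keeps the subtract-per-iteration loop instead of strength-reducing it into a 64-bit divide/modulo (expensive, and a libgcc call on some 32-bit targets). The same trick in isolation, mirroring __iter_div_u64_rem (names are illustrative):

#include <linux/types.h>

/* Advance *stamp toward 'now' in whole periods without emitting a u64 divide. */
static unsigned long consume_periods(u64 *stamp, u64 now, u64 period)
{
	unsigned long n = 0;

	while ((s64)(now - *stamp) > (s64)period) {
		/* Opaque no-op: keeps the compiler from rewriting the
		 * loop as a divmod, exactly as in the hunk above. */
		asm("" : "+rm" (*stamp));
		*stamp += period;
		n++;
	}
	return n;
}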
@@ -1390,32 +1391,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1391 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1392};
1392 1393
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1394/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1395enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1396 CPUACCT_STAT_USER, /* ... user mode */
@@ -1529,24 +1504,9 @@ static unsigned long target_load(int cpu, int type)
1529 return max(rq->cpu_load[type-1], total); 1504 return max(rq->cpu_load[type-1], total);
1530} 1505}
1531 1506
1532static struct sched_group *group_of(int cpu)
1533{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1535
1536 if (!sd)
1537 return NULL;
1538
1539 return sd->groups;
1540}
1541
1542static unsigned long power_of(int cpu) 1507static unsigned long power_of(int cpu)
1543{ 1508{
1544 struct sched_group *group = group_of(cpu); 1509 return cpu_rq(cpu)->cpu_power;
1545
1546 if (!group)
1547 return SCHED_LOAD_SCALE;
1548
1549 return group->cpu_power;
1550} 1510}
1551 1511
1552static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1512static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1566,7 +1526,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1526
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1527#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1528
1569static __read_mostly unsigned long *update_shares_data; 1529static __read_mostly unsigned long __percpu *update_shares_data;
1570 1530
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1531static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1532
@@ -1701,21 +1661,8 @@ static void update_shares(struct sched_domain *sd)
1701 } 1661 }
1702} 1662}
1703 1663
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1664static void update_h_load(long cpu)
1715{ 1665{
1716 if (root_task_group_empty())
1717 return;
1718
1719 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1666 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1720} 1667}
1721 1668
@@ -1725,10 +1672,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1672{
1726} 1673}
1727 1674
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1675#endif
1733 1676
1734#ifdef CONFIG_PREEMPT 1677#ifdef CONFIG_PREEMPT
@@ -1805,6 +1748,49 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1748 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1749 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1750}
1751
1752/*
1753 * double_rq_lock - safely lock two runqueues
1754 *
1755 * Note this does not disable interrupts like task_rq_lock,
1756 * you need to do so manually before calling.
1757 */
1758static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1759 __acquires(rq1->lock)
1760 __acquires(rq2->lock)
1761{
1762 BUG_ON(!irqs_disabled());
1763 if (rq1 == rq2) {
1764 raw_spin_lock(&rq1->lock);
1765 __acquire(rq2->lock); /* Fake it out ;) */
1766 } else {
1767 if (rq1 < rq2) {
1768 raw_spin_lock(&rq1->lock);
1769 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1770 } else {
1771 raw_spin_lock(&rq2->lock);
1772 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1773 }
1774 }
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
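double_rq_lock()/double_rq_unlock() are moved up here from the load-balancing code removed later in this diff (the old copies, deleted further down, additionally refreshed both rq clocks). The key property is the address-based ordering: whenever two distinct runqueues are needed, the lower-addressed lock is taken first, so every path follows one global order and ABBA deadlock is impossible. The same discipline on ordinary spinlocks, as a standalone sketch (names are illustrative):

#include <linux/spinlock.h>

/* Lock two objects in a globally consistent (address) order. */
static void lock_pair(spinlock_t *a, spinlock_t *b)
{
	if (a == b) {
		spin_lock(a);
	} else if (a < b) {
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}

static void unlock_pair(spinlock_t *a, spinlock_t *b)
{
	spin_unlock(a);
	if (a != b)
		spin_unlock(b);
}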
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1816,7 +1802,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1816} 1802}
1817#endif 1803#endif
1818 1804
1819static void calc_load_account_active(struct rq *this_rq); 1805static void calc_load_account_idle(struct rq *this_rq);
1820static void update_sysctl(void); 1806static void update_sysctl(void);
1821static int get_update_sysctl_factor(void); 1807static int get_update_sysctl_factor(void);
1822 1808
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1859,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
1859static void set_load_weight(struct task_struct *p) 1841static void set_load_weight(struct task_struct *p)
1860{ 1842{
1861 if (task_has_rt_policy(p)) { 1843 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = prio_to_weight[0] * 2; 1844 p->se.load.weight = 0;
1863 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1845 p->se.load.inv_weight = WMULT_CONST;
1864 return; 1846 return;
1865 } 1847 }
1866 1848
@@ -1877,40 +1859,53 @@ static void set_load_weight(struct task_struct *p)
1877 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1878} 1860}
1879 1861
1880static void update_avg(u64 *avg, u64 sample) 1862static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1881{ 1863{
1882 s64 diff = sample - *avg; 1864 update_rq_clock(rq);
1883 *avg += diff >> 3; 1865 sched_info_queued(p);
1866 p->sched_class->enqueue_task(rq, p, flags);
1867 p->se.on_rq = 1;
1884} 1868}
1885 1869
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1870static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1887{ 1871{
1888 if (wakeup) 1872 update_rq_clock(rq);
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1873 sched_info_dequeued(p);
1874 p->sched_class->dequeue_task(rq, p, flags);
1875 p->se.on_rq = 0;
1876}
1890 1877
1891 sched_info_queued(p); 1878/*
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1879 * activate_task - move a task to the runqueue.
1893 p->se.on_rq = 1; 1880 */
1881static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1882{
1883 if (task_contributes_to_load(p))
1884 rq->nr_uninterruptible--;
1885
1886 enqueue_task(rq, p, flags);
1887 inc_nr_running(rq);
1894} 1888}
1895 1889
1896static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1890/*
1891 * deactivate_task - remove a task from the runqueue.
1892 */
1893static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1897{ 1894{
1898 if (sleep) { 1895 if (task_contributes_to_load(p))
1899 if (p->se.last_wakeup) { 1896 rq->nr_uninterruptible++;
1900 update_avg(&p->se.avg_overlap,
1901 p->se.sum_exec_runtime - p->se.last_wakeup);
1902 p->se.last_wakeup = 0;
1903 } else {
1904 update_avg(&p->se.avg_wakeup,
1905 sysctl_sched_wakeup_granularity);
1906 }
1907 }
1908 1897
1909 sched_info_dequeued(p); 1898 dequeue_task(rq, p, flags);
1910 p->sched_class->dequeue_task(rq, p, sleep); 1899 dec_nr_running(rq);
1911 p->se.on_rq = 0;
1912} 1900}
1913 1901
1902#include "sched_idletask.c"
1903#include "sched_fair.c"
1904#include "sched_rt.c"
1905#ifdef CONFIG_SCHED_DEBUG
1906# include "sched_debug.c"
1907#endif
1908
1914/* 1909/*
1915 * __normal_prio - return the priority that is based on the static prio 1910 * __normal_prio - return the priority that is based on the static prio
1916 */ 1911 */
@@ -1957,30 +1952,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1952 return p->prio;
1958} 1953}
1959 1954
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1955/**
1985 * task_curr - is this task currently executing on a CPU? 1956 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1957 * @p: the task in question.
@@ -2053,21 +2024,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2053 __set_task_cpu(p, new_cpu); 2024 __set_task_cpu(p, new_cpu);
2054} 2025}
2055 2026
2056struct migration_req { 2027struct migration_arg {
2057 struct list_head list;
2058
2059 struct task_struct *task; 2028 struct task_struct *task;
2060 int dest_cpu; 2029 int dest_cpu;
2061
2062 struct completion done;
2063}; 2030};
2064 2031
2032static int migration_cpu_stop(void *data);
2033
2065/* 2034/*
2066 * The task's runqueue lock must be held. 2035 * The task's runqueue lock must be held.
2067 * Returns true if you have to wait for migration thread. 2036 * Returns true if you have to wait for migration thread.
2068 */ 2037 */
2069static int 2038static bool migrate_task(struct task_struct *p, int dest_cpu)
2070migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2071{ 2039{
2072 struct rq *rq = task_rq(p); 2040 struct rq *rq = task_rq(p);
2073 2041
@@ -2075,58 +2043,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2075 * If the task is not on a runqueue (and not running), then 2043 * If the task is not on a runqueue (and not running), then
2076 * the next wake-up will properly place the task. 2044 * the next wake-up will properly place the task.
2077 */ 2045 */
2078 if (!p->se.on_rq && !task_running(rq, p)) 2046 return p->se.on_rq || task_running(rq, p);
2079 return 0;
2080
2081 init_completion(&req->done);
2082 req->task = p;
2083 req->dest_cpu = dest_cpu;
2084 list_add(&req->list, &rq->migration_queue);
2085
2086 return 1;
2087}
2088
2089/*
2090 * wait_task_context_switch - wait for a thread to complete at least one
2091 * context switch.
2092 *
2093 * @p must not be current.
2094 */
2095void wait_task_context_switch(struct task_struct *p)
2096{
2097 unsigned long nvcsw, nivcsw, flags;
2098 int running;
2099 struct rq *rq;
2100
2101 nvcsw = p->nvcsw;
2102 nivcsw = p->nivcsw;
2103 for (;;) {
2104 /*
2105 * The runqueue is assigned before the actual context
2106 * switch. We need to take the runqueue lock.
2107 *
2108 * We could check initially without the lock but it is
2109 * very likely that we need to take the lock in every
2110 * iteration.
2111 */
2112 rq = task_rq_lock(p, &flags);
2113 running = task_running(rq, p);
2114 task_rq_unlock(rq, &flags);
2115
2116 if (likely(!running))
2117 break;
2118 /*
2119 * The switch count is incremented before the actual
2120 * context switch. We thus wait for two switches to be
2121 * sure at least one completed.
2122 */
2123 if ((p->nvcsw - nvcsw) > 1)
2124 break;
2125 if ((p->nivcsw - nivcsw) > 1)
2126 break;
2127
2128 cpu_relax();
2129 }
2130} 2047}
2131 2048
2132/* 2049/*
@@ -2184,7 +2101,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2184 * just go back and repeat. 2101 * just go back and repeat.
2185 */ 2102 */
2186 rq = task_rq_lock(p, &flags); 2103 rq = task_rq_lock(p, &flags);
2187 trace_sched_wait_task(rq, p); 2104 trace_sched_wait_task(p);
2188 running = task_running(rq, p); 2105 running = task_running(rq, p);
2189 on_rq = p->se.on_rq; 2106 on_rq = p->se.on_rq;
2190 ncsw = 0; 2107 ncsw = 0;
@@ -2282,6 +2199,9 @@ void task_oncpu_function_call(struct task_struct *p,
2282} 2199}
2283 2200
2284#ifdef CONFIG_SMP 2201#ifdef CONFIG_SMP
2202/*
2203 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2204 */
2285static int select_fallback_rq(int cpu, struct task_struct *p) 2205static int select_fallback_rq(int cpu, struct task_struct *p)
2286{ 2206{
2287 int dest_cpu; 2207 int dest_cpu;
@@ -2298,12 +2218,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2298 return dest_cpu; 2218 return dest_cpu;
2299 2219
2300 /* No more Mr. Nice Guy. */ 2220 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) { 2221 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2302 rcu_read_lock(); 2222 dest_cpu = cpuset_cpus_allowed_fallback(p);
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /* 2223 /*
2308 * Don't tell them about moving exiting tasks or 2224 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never 2225 * kernel threads (both mm NULL), since they never
@@ -2320,19 +2236,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2320} 2236}
2321 2237
2322/* 2238/*
2323 * Called from: 2239 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2324 *
2325 * - fork, @p is stable because it isn't on the tasklist yet
2326 *
2327 * - exec, @p is unstable, retry loop
2328 *
2329 * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
2330 * we should be good.
2331 */ 2240 */
2332static inline 2241static inline
2333int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2242int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2334{ 2243{
2335 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2244 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2336 2245
2337 /* 2246 /*
2338 * In order not to call set_task_cpu() on a blocking task we need 2247 * In order not to call set_task_cpu() on a blocking task we need
@@ -2350,6 +2259,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2350 2259
2351 return cpu; 2260 return cpu;
2352} 2261}
2262
2263static void update_avg(u64 *avg, u64 sample)
2264{
2265 s64 diff = sample - *avg;
2266 *avg += diff >> 3;
2267}
2353#endif 2268#endif
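update_avg() keeps an exponentially weighted moving average with a weight of 1/8: each sample pulls the average one eighth of the way toward itself, avg_new = avg + (sample - avg) / 8. Worked example: with avg = 800 and sample = 1600 the new average is 800 + (1600 - 800)/8 = 900; a following sample of 400 gives 900 + ((400 - 900) >> 3) = 900 - 63 = 837, since the signed shift rounds toward negative infinity.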
2354 2269
2355/*** 2270/***
@@ -2371,16 +2286,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2371{ 2286{
2372 int cpu, orig_cpu, this_cpu, success = 0; 2287 int cpu, orig_cpu, this_cpu, success = 0;
2373 unsigned long flags; 2288 unsigned long flags;
2374 struct rq *rq, *orig_rq; 2289 unsigned long en_flags = ENQUEUE_WAKEUP;
2375 2290 struct rq *rq;
2376 if (!sched_feat(SYNC_WAKEUPS))
2377 wake_flags &= ~WF_SYNC;
2378 2291
2379 this_cpu = get_cpu(); 2292 this_cpu = get_cpu();
2380 2293
2381 smp_wmb(); 2294 smp_wmb();
2382 rq = orig_rq = task_rq_lock(p, &flags); 2295 rq = task_rq_lock(p, &flags);
2383 update_rq_clock(rq);
2384 if (!(p->state & state)) 2296 if (!(p->state & state))
2385 goto out; 2297 goto out;
2386 2298
@@ -2400,24 +2312,35 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2400 * 2312 *
2401 * First fix up the nr_uninterruptible count: 2313 * First fix up the nr_uninterruptible count:
2402 */ 2314 */
2403 if (task_contributes_to_load(p)) 2315 if (task_contributes_to_load(p)) {
2404 rq->nr_uninterruptible--; 2316 if (likely(cpu_online(orig_cpu)))
2317 rq->nr_uninterruptible--;
2318 else
2319 this_rq()->nr_uninterruptible--;
2320 }
2405 p->state = TASK_WAKING; 2321 p->state = TASK_WAKING;
2406 2322
2407 if (p->sched_class->task_waking) 2323 if (p->sched_class->task_waking) {
2408 p->sched_class->task_waking(rq, p); 2324 p->sched_class->task_waking(rq, p);
2325 en_flags |= ENQUEUE_WAKING;
2326 }
2409 2327
2410 __task_rq_unlock(rq); 2328 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2411
2412 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2413 if (cpu != orig_cpu) 2329 if (cpu != orig_cpu)
2414 set_task_cpu(p, cpu); 2330 set_task_cpu(p, cpu);
2331 __task_rq_unlock(rq);
2415 2332
2416 rq = __task_rq_lock(p); 2333 rq = cpu_rq(cpu);
2417 update_rq_clock(rq); 2334 raw_spin_lock(&rq->lock);
2418 2335
2336 /*
2337 * We migrated the task without holding either rq->lock, however
2338 * since the task is not on the task list itself, nobody else
2339 * will try and migrate the task, hence the rq should match the
2340 * cpu we just moved it to.
2341 */
2342 WARN_ON(task_cpu(p) != cpu);
2419 WARN_ON(p->state != TASK_WAKING); 2343 WARN_ON(p->state != TASK_WAKING);
2420 cpu = task_cpu(p);
2421 2344
2422#ifdef CONFIG_SCHEDSTATS 2345#ifdef CONFIG_SCHEDSTATS
2423 schedstat_inc(rq, ttwu_count); 2346 schedstat_inc(rq, ttwu_count);
@@ -2436,36 +2359,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2436 2359
2437out_activate: 2360out_activate:
2438#endif /* CONFIG_SMP */ 2361#endif /* CONFIG_SMP */
2439 schedstat_inc(p, se.nr_wakeups); 2362 schedstat_inc(p, se.statistics.nr_wakeups);
2440 if (wake_flags & WF_SYNC) 2363 if (wake_flags & WF_SYNC)
2441 schedstat_inc(p, se.nr_wakeups_sync); 2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2442 if (orig_cpu != cpu) 2365 if (orig_cpu != cpu)
2443 schedstat_inc(p, se.nr_wakeups_migrate); 2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2444 if (cpu == this_cpu) 2367 if (cpu == this_cpu)
2445 schedstat_inc(p, se.nr_wakeups_local); 2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2446 else 2369 else
2447 schedstat_inc(p, se.nr_wakeups_remote); 2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2448 activate_task(rq, p, 1); 2371 activate_task(rq, p, en_flags);
2449 success = 1; 2372 success = 1;
2450 2373
2451 /*
2452 * Only attribute actual wakeups done by this task.
2453 */
2454 if (!in_interrupt()) {
2455 struct sched_entity *se = &current->se;
2456 u64 sample = se->sum_exec_runtime;
2457
2458 if (se->last_wakeup)
2459 sample -= se->last_wakeup;
2460 else
2461 sample -= se->start_runtime;
2462 update_avg(&se->avg_wakeup, sample);
2463
2464 se->last_wakeup = se->sum_exec_runtime;
2465 }
2466
2467out_running: 2374out_running:
2468 trace_sched_wakeup(rq, p, success); 2375 trace_sched_wakeup(p, success);
2469 check_preempt_curr(rq, p, wake_flags); 2376 check_preempt_curr(rq, p, wake_flags);
2470 2377
2471 p->state = TASK_RUNNING; 2378 p->state = TASK_RUNNING;
@@ -2525,42 +2432,9 @@ static void __sched_fork(struct task_struct *p)
2525 p->se.sum_exec_runtime = 0; 2432 p->se.sum_exec_runtime = 0;
2526 p->se.prev_sum_exec_runtime = 0; 2433 p->se.prev_sum_exec_runtime = 0;
2527 p->se.nr_migrations = 0; 2434 p->se.nr_migrations = 0;
2528 p->se.last_wakeup = 0;
2529 p->se.avg_overlap = 0;
2530 p->se.start_runtime = 0;
2531 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2532 2435
2533#ifdef CONFIG_SCHEDSTATS 2436#ifdef CONFIG_SCHEDSTATS
2534 p->se.wait_start = 0; 2437 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2535 p->se.wait_max = 0;
2536 p->se.wait_count = 0;
2537 p->se.wait_sum = 0;
2538
2539 p->se.sleep_start = 0;
2540 p->se.sleep_max = 0;
2541 p->se.sum_sleep_runtime = 0;
2542
2543 p->se.block_start = 0;
2544 p->se.block_max = 0;
2545 p->se.exec_max = 0;
2546 p->se.slice_max = 0;
2547
2548 p->se.nr_migrations_cold = 0;
2549 p->se.nr_failed_migrations_affine = 0;
2550 p->se.nr_failed_migrations_running = 0;
2551 p->se.nr_failed_migrations_hot = 0;
2552 p->se.nr_forced_migrations = 0;
2553
2554 p->se.nr_wakeups = 0;
2555 p->se.nr_wakeups_sync = 0;
2556 p->se.nr_wakeups_migrate = 0;
2557 p->se.nr_wakeups_local = 0;
2558 p->se.nr_wakeups_remote = 0;
2559 p->se.nr_wakeups_affine = 0;
2560 p->se.nr_wakeups_affine_attempts = 0;
2561 p->se.nr_wakeups_passive = 0;
2562 p->se.nr_wakeups_idle = 0;
2563
2564#endif 2438#endif
2565 2439
2566 INIT_LIST_HEAD(&p->rt.run_list); 2440 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2581,11 +2455,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2581 2455
2582 __sched_fork(p); 2456 __sched_fork(p);
2583 /* 2457 /*
2584 * We mark the process as waking here. This guarantees that 2458 * We mark the process as running here. This guarantees that
2585 * nobody will actually run it, and a signal or other external 2459 * nobody will actually run it, and a signal or other external
2586 * event cannot wake it up and insert it on the runqueue either. 2460 * event cannot wake it up and insert it on the runqueue either.
2587 */ 2461 */
2588 p->state = TASK_WAKING; 2462 p->state = TASK_RUNNING;
2589 2463
2590 /* 2464 /*
2591 * Revert to default priority/policy on fork if requested. 2465 * Revert to default priority/policy on fork if requested.
@@ -2620,10 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2620 if (p->sched_class->task_fork) 2494 if (p->sched_class->task_fork)
2621 p->sched_class->task_fork(p); 2495 p->sched_class->task_fork(p);
2622 2496
2623#ifdef CONFIG_SMP 2497 /*
2624 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2498 * The child is not yet in the pid-hash so no cgroup attach races,
2625#endif 2499 * and the cgroup is pinned to this child due to cgroup_fork()
2500 * is ran before sched_fork().
2501 *
2502 * Silence PROVE_RCU.
2503 */
2504 rcu_read_lock();
2626 set_task_cpu(p, cpu); 2505 set_task_cpu(p, cpu);
2506 rcu_read_unlock();
2627 2507
2628#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2508#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2629 if (likely(sched_info_on())) 2509 if (likely(sched_info_on()))
@@ -2652,19 +2532,37 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2652{ 2532{
2653 unsigned long flags; 2533 unsigned long flags;
2654 struct rq *rq; 2534 struct rq *rq;
2535 int cpu __maybe_unused = get_cpu();
2655 2536
2537#ifdef CONFIG_SMP
2656 rq = task_rq_lock(p, &flags); 2538 rq = task_rq_lock(p, &flags);
2657 BUG_ON(p->state != TASK_WAKING); 2539 p->state = TASK_WAKING;
2540
2541 /*
2542 * Fork balancing, do it here and not earlier because:
2543 * - cpus_allowed can change in the fork path
2544 * - any previously selected cpu might disappear through hotplug
2545 *
2546 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2547 * without people poking at ->cpus_allowed.
2548 */
2549 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2550 set_task_cpu(p, cpu);
2551
2658 p->state = TASK_RUNNING; 2552 p->state = TASK_RUNNING;
2659 update_rq_clock(rq); 2553 task_rq_unlock(rq, &flags);
2554#endif
2555
2556 rq = task_rq_lock(p, &flags);
2660 activate_task(rq, p, 0); 2557 activate_task(rq, p, 0);
2661 trace_sched_wakeup_new(rq, p, 1); 2558 trace_sched_wakeup_new(p, 1);
2662 check_preempt_curr(rq, p, WF_FORK); 2559 check_preempt_curr(rq, p, WF_FORK);
2663#ifdef CONFIG_SMP 2560#ifdef CONFIG_SMP
2664 if (p->sched_class->task_woken) 2561 if (p->sched_class->task_woken)
2665 p->sched_class->task_woken(rq, p); 2562 p->sched_class->task_woken(rq, p);
2666#endif 2563#endif
2667 task_rq_unlock(rq, &flags); 2564 task_rq_unlock(rq, &flags);
2565 put_cpu();
2668} 2566}
2669 2567
2670#ifdef CONFIG_PREEMPT_NOTIFIERS 2568#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2783,7 +2681,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2783 */ 2681 */
2784 prev_state = prev->state; 2682 prev_state = prev->state;
2785 finish_arch_switch(prev); 2683 finish_arch_switch(prev);
2786 perf_event_task_sched_in(current, cpu_of(rq)); 2684#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2685 local_irq_disable();
2686#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2687 perf_event_task_sched_in(current);
2688#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2689 local_irq_enable();
2690#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2787 finish_lock_switch(rq, prev); 2691 finish_lock_switch(rq, prev);
2788 2692
2789 fire_sched_in_preempt_notifiers(current); 2693 fire_sched_in_preempt_notifiers(current);
@@ -2871,7 +2775,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2871 struct mm_struct *mm, *oldmm; 2775 struct mm_struct *mm, *oldmm;
2872 2776
2873 prepare_task_switch(rq, prev, next); 2777 prepare_task_switch(rq, prev, next);
2874 trace_sched_switch(rq, prev, next); 2778 trace_sched_switch(prev, next);
2875 mm = next->mm; 2779 mm = next->mm;
2876 oldmm = prev->active_mm; 2780 oldmm = prev->active_mm;
2877 /* 2781 /*
@@ -2969,9 +2873,9 @@ unsigned long nr_iowait(void)
2969 return sum; 2873 return sum;
2970} 2874}
2971 2875
2972unsigned long nr_iowait_cpu(void) 2876unsigned long nr_iowait_cpu(int cpu)
2973{ 2877{
2974 struct rq *this = this_rq(); 2878 struct rq *this = cpu_rq(cpu);
2975 return atomic_read(&this->nr_iowait); 2879 return atomic_read(&this->nr_iowait);
2976} 2880}
2977 2881
@@ -2988,6 +2892,61 @@ static unsigned long calc_load_update;
2988unsigned long avenrun[3]; 2892unsigned long avenrun[3];
2989EXPORT_SYMBOL(avenrun); 2893EXPORT_SYMBOL(avenrun);
2990 2894
2895static long calc_load_fold_active(struct rq *this_rq)
2896{
2897 long nr_active, delta = 0;
2898
2899 nr_active = this_rq->nr_running;
2900 nr_active += (long) this_rq->nr_uninterruptible;
2901
2902 if (nr_active != this_rq->calc_load_active) {
2903 delta = nr_active - this_rq->calc_load_active;
2904 this_rq->calc_load_active = nr_active;
2905 }
2906
2907 return delta;
2908}
2909
2910#ifdef CONFIG_NO_HZ
2911/*
2912 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2913 *
2914 * When making the ILB scale, we should try to pull this in as well.
2915 */
2916static atomic_long_t calc_load_tasks_idle;
2917
2918static void calc_load_account_idle(struct rq *this_rq)
2919{
2920 long delta;
2921
2922 delta = calc_load_fold_active(this_rq);
2923 if (delta)
2924 atomic_long_add(delta, &calc_load_tasks_idle);
2925}
2926
2927static long calc_load_fold_idle(void)
2928{
2929 long delta = 0;
2930
2931 /*
2932 * It's got a race; we don't care...
2933 */
2934 if (atomic_long_read(&calc_load_tasks_idle))
2935 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2936
2937 return delta;
2938}
2939#else
2940static void calc_load_account_idle(struct rq *this_rq)
2941{
2942}
2943
2944static inline long calc_load_fold_idle(void)
2945{
2946 return 0;
2947}
2948#endif
2949
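The NO_HZ variant parks a CPU's load delta instead of applying it right away: calc_load_account_idle() is meant to run as a CPU goes idle (its call site, in the idle scheduling class, is outside the hunks shown here; note the earlier rename of the forward declaration from calc_load_account_active to calc_load_account_idle), and any change relative to that CPU's last snapshot is accumulated in calc_load_tasks_idle. Worked example: a CPU whose last snapshot was 3 contributing tasks goes idle with 0, so it adds -3 to calc_load_tasks_idle and stops ticking; at the next LOAD_FREQ boundary a busy CPU's calc_load_account_active() (reworked just below) xchg's that -3 out through calc_load_fold_idle() and folds it, together with its own delta, into the global calc_load_tasks that calc_global_load() samples.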
2991/** 2950/**
2992 * get_avenrun - get the load average array 2951 * get_avenrun - get the load average array
2993 * @loads: pointer to dest load array 2952 * @loads: pointer to dest load array
@@ -3034,20 +2993,22 @@ void calc_global_load(void)
3034} 2993}
3035 2994
3036/* 2995/*
3037 * Either called from update_cpu_load() or from a cpu going idle 2996 * Called from update_cpu_load() to periodically update this CPU's
2997 * active count.
3038 */ 2998 */
3039static void calc_load_account_active(struct rq *this_rq) 2999static void calc_load_account_active(struct rq *this_rq)
3040{ 3000{
3041 long nr_active, delta; 3001 long delta;
3042 3002
3043 nr_active = this_rq->nr_running; 3003 if (time_before(jiffies, this_rq->calc_load_update))
3044 nr_active += (long) this_rq->nr_uninterruptible; 3004 return;
3045 3005
3046 if (nr_active != this_rq->calc_load_active) { 3006 delta = calc_load_fold_active(this_rq);
3047 delta = nr_active - this_rq->calc_load_active; 3007 delta += calc_load_fold_idle();
3048 this_rq->calc_load_active = nr_active; 3008 if (delta)
3049 atomic_long_add(delta, &calc_load_tasks); 3009 atomic_long_add(delta, &calc_load_tasks);
3050 } 3010
3011 this_rq->calc_load_update += LOAD_FREQ;
3051} 3012}
3052 3013
3053/* 3014/*
@@ -3079,1871 +3040,42 @@ static void update_cpu_load(struct rq *this_rq)
3079 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3080 } 3041 }
3081 3042
3082 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3043 calc_load_account_active(this_rq);
3083 this_rq->calc_load_update += LOAD_FREQ;
3084 calc_load_account_active(this_rq);
3085 }
3086} 3044}
3087 3045
3088#ifdef CONFIG_SMP 3046#ifdef CONFIG_SMP
3089 3047
3090/* 3048/*
3091 * double_rq_lock - safely lock two runqueues
3092 *
3093 * Note this does not disable interrupts like task_rq_lock,
3094 * you need to do so manually before calling.
3095 */
3096static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3097 __acquires(rq1->lock)
3098 __acquires(rq2->lock)
3099{
3100 BUG_ON(!irqs_disabled());
3101 if (rq1 == rq2) {
3102 raw_spin_lock(&rq1->lock);
3103 __acquire(rq2->lock); /* Fake it out ;) */
3104 } else {
3105 if (rq1 < rq2) {
3106 raw_spin_lock(&rq1->lock);
3107 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3108 } else {
3109 raw_spin_lock(&rq2->lock);
3110 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3111 }
3112 }
3113 update_rq_clock(rq1);
3114 update_rq_clock(rq2);
3115}
3116
3117/*
3118 * double_rq_unlock - safely unlock two runqueues
3119 *
3120 * Note this does not restore interrupts like task_rq_unlock,
3121 * you need to do so manually after calling.
3122 */
3123static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3124 __releases(rq1->lock)
3125 __releases(rq2->lock)
3126{
3127 raw_spin_unlock(&rq1->lock);
3128 if (rq1 != rq2)
3129 raw_spin_unlock(&rq2->lock);
3130 else
3131 __release(rq2->lock);
3132}
3133
3134/*
3135 * sched_exec - execve() is a valuable balancing opportunity, because at 3049 * sched_exec - execve() is a valuable balancing opportunity, because at
3136 * this point the task has the smallest effective memory and cache footprint. 3050 * this point the task has the smallest effective memory and cache footprint.
3137 */ 3051 */
3138void sched_exec(void) 3052void sched_exec(void)
3139{ 3053{
3140 struct task_struct *p = current; 3054 struct task_struct *p = current;
3141 struct migration_req req;
3142 int dest_cpu, this_cpu;
3143 unsigned long flags; 3055 unsigned long flags;
3144 struct rq *rq; 3056 struct rq *rq;
3145 3057 int dest_cpu;
3146again:
3147 this_cpu = get_cpu();
3148 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3149 if (dest_cpu == this_cpu) {
3150 put_cpu();
3151 return;
3152 }
3153 3058
3154 rq = task_rq_lock(p, &flags); 3059 rq = task_rq_lock(p, &flags);
3155 put_cpu(); 3060 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3061 if (dest_cpu == smp_processor_id())
3062 goto unlock;
3156 3063
3157 /* 3064 /*
3158 * select_task_rq() can race against ->cpus_allowed 3065 * select_task_rq() can race against ->cpus_allowed
3159 */ 3066 */
3160 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3067 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3161 || unlikely(!cpu_active(dest_cpu))) { 3068 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3162 task_rq_unlock(rq, &flags); 3069 struct migration_arg arg = { p, dest_cpu };
3163 goto again;
3164 }
3165 3070
3166 /* force the process onto the specified CPU */
3167 if (migrate_task(p, dest_cpu, &req)) {
3168 /* Need to wait for migration thread (might exit: take ref). */
3169 struct task_struct *mt = rq->migration_thread;
3170
3171 get_task_struct(mt);
3172 task_rq_unlock(rq, &flags); 3071 task_rq_unlock(rq, &flags);
3173 wake_up_process(mt); 3072 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3174 put_task_struct(mt);
3175 wait_for_completion(&req.done);
3176
3177 return; 3073 return;
3178 } 3074 }
3075unlock:
3179 task_rq_unlock(rq, &flags); 3076 task_rq_unlock(rq, &flags);
3180} 3077}
3181 3078
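sched_exec() no longer queues a migration_req and wakes a per-runqueue migration thread; it fills in a migration_arg and has the generic cpu-stop machinery run migration_cpu_stop() on the source CPU, with stop_one_cpu() waiting for it to finish, which is why the retry loop and the completion bookkeeping disappear. The same primitive can be used directly elsewhere; a tiny hedged sketch (callback name and message are invented):

#include <linux/stop_machine.h>
#include <linux/kernel.h>
#include <linux/smp.h>

/* Runs in the per-cpu stopper context on the target CPU. */
static int say_hello(void *arg)
{
	printk(KERN_INFO "hello from CPU %d\n", smp_processor_id());
	return 0;
}

static void demo_stop_one_cpu(unsigned int cpu)
{
	/* Blocks until say_hello() has run on @cpu (its result is ignored here). */
	stop_one_cpu(cpu, say_hello, NULL);
}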
3182/*
3183 * pull_task - move a task from a remote runqueue to the local runqueue.
3184 * Both runqueues must be locked.
3185 */
3186static void pull_task(struct rq *src_rq, struct task_struct *p,
3187 struct rq *this_rq, int this_cpu)
3188{
3189 deactivate_task(src_rq, p, 0);
3190 set_task_cpu(p, this_cpu);
3191 activate_task(this_rq, p, 0);
3192 check_preempt_curr(this_rq, p, 0);
3193}
3194
3195/*
3196 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3197 */
3198static
3199int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3200 struct sched_domain *sd, enum cpu_idle_type idle,
3201 int *all_pinned)
3202{
3203 int tsk_cache_hot = 0;
3204 /*
3205 * We do not migrate tasks that are:
3206 * 1) running (obviously), or
3207 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3208 * 3) are cache-hot on their current CPU.
3209 */
3210 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3211 schedstat_inc(p, se.nr_failed_migrations_affine);
3212 return 0;
3213 }
3214 *all_pinned = 0;
3215
3216 if (task_running(rq, p)) {
3217 schedstat_inc(p, se.nr_failed_migrations_running);
3218 return 0;
3219 }
3220
3221 /*
3222 * Aggressive migration if:
3223 * 1) task is cache cold, or
3224 * 2) too many balance attempts have failed.
3225 */
3226
3227 tsk_cache_hot = task_hot(p, rq->clock, sd);
3228 if (!tsk_cache_hot ||
3229 sd->nr_balance_failed > sd->cache_nice_tries) {
3230#ifdef CONFIG_SCHEDSTATS
3231 if (tsk_cache_hot) {
3232 schedstat_inc(sd, lb_hot_gained[idle]);
3233 schedstat_inc(p, se.nr_forced_migrations);
3234 }
3235#endif
3236 return 1;
3237 }
3238
3239 if (tsk_cache_hot) {
3240 schedstat_inc(p, se.nr_failed_migrations_hot);
3241 return 0;
3242 }
3243 return 1;
3244}
3245
3246static unsigned long
3247balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3248 unsigned long max_load_move, struct sched_domain *sd,
3249 enum cpu_idle_type idle, int *all_pinned,
3250 int *this_best_prio, struct rq_iterator *iterator)
3251{
3252 int loops = 0, pulled = 0, pinned = 0;
3253 struct task_struct *p;
3254 long rem_load_move = max_load_move;
3255
3256 if (max_load_move == 0)
3257 goto out;
3258
3259 pinned = 1;
3260
3261 /*
3262 * Start the load-balancing iterator:
3263 */
3264 p = iterator->start(iterator->arg);
3265next:
3266 if (!p || loops++ > sysctl_sched_nr_migrate)
3267 goto out;
3268
3269 if ((p->se.load.weight >> 1) > rem_load_move ||
3270 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3271 p = iterator->next(iterator->arg);
3272 goto next;
3273 }
3274
3275 pull_task(busiest, p, this_rq, this_cpu);
3276 pulled++;
3277 rem_load_move -= p->se.load.weight;
3278
3279#ifdef CONFIG_PREEMPT
3280 /*
3281 * NEWIDLE balancing is a source of latency, so preemptible kernels
3282 * will stop after the first task is pulled to minimize the critical
3283 * section.
3284 */
3285 if (idle == CPU_NEWLY_IDLE)
3286 goto out;
3287#endif
3288
3289 /*
3290 * We only want to steal up to the prescribed amount of weighted load.
3291 */
3292 if (rem_load_move > 0) {
3293 if (p->prio < *this_best_prio)
3294 *this_best_prio = p->prio;
3295 p = iterator->next(iterator->arg);
3296 goto next;
3297 }
3298out:
3299 /*
3300 * Right now, this is one of only two places pull_task() is called,
3301 * so we can safely collect pull_task() stats here rather than
3302 * inside pull_task().
3303 */
3304 schedstat_add(sd, lb_gained[idle], pulled);
3305
3306 if (all_pinned)
3307 *all_pinned = pinned;
3308
3309 return max_load_move - rem_load_move;
3310}
3311
3312/*
3313 * move_tasks tries to move up to max_load_move weighted load from busiest to
3314 * this_rq, as part of a balancing operation within domain "sd".
3315 * Returns 1 if successful and 0 otherwise.
3316 *
3317 * Called with both runqueues locked.
3318 */
3319static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3320 unsigned long max_load_move,
3321 struct sched_domain *sd, enum cpu_idle_type idle,
3322 int *all_pinned)
3323{
3324 const struct sched_class *class = sched_class_highest;
3325 unsigned long total_load_moved = 0;
3326 int this_best_prio = this_rq->curr->prio;
3327
3328 do {
3329 total_load_moved +=
3330 class->load_balance(this_rq, this_cpu, busiest,
3331 max_load_move - total_load_moved,
3332 sd, idle, all_pinned, &this_best_prio);
3333 class = class->next;
3334
3335#ifdef CONFIG_PREEMPT
3336 /*
3337 * NEWIDLE balancing is a source of latency, so preemptible
3338 * kernels will stop after the first task is pulled to minimize
3339 * the critical section.
3340 */
3341 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3342 break;
3343#endif
3344 } while (class && max_load_move > total_load_moved);
3345
3346 return total_load_moved > 0;
3347}
3348
3349static int
3350iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 struct sched_domain *sd, enum cpu_idle_type idle,
3352 struct rq_iterator *iterator)
3353{
3354 struct task_struct *p = iterator->start(iterator->arg);
3355 int pinned = 0;
3356
3357 while (p) {
3358 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3359 pull_task(busiest, p, this_rq, this_cpu);
3360 /*
3361 * Right now, this is only the second place pull_task()
3362 * is called, so we can safely collect pull_task()
3363 * stats here rather than inside pull_task().
3364 */
3365 schedstat_inc(sd, lb_gained[idle]);
3366
3367 return 1;
3368 }
3369 p = iterator->next(iterator->arg);
3370 }
3371
3372 return 0;
3373}
3374
3375/*
3376 * move_one_task tries to move exactly one task from busiest to this_rq, as
3377 * part of active balancing operations within "domain".
3378 * Returns 1 if successful and 0 otherwise.
3379 *
3380 * Called with both runqueues locked.
3381 */
3382static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3383 struct sched_domain *sd, enum cpu_idle_type idle)
3384{
3385 const struct sched_class *class;
3386
3387 for_each_class(class) {
3388 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3389 return 1;
3390 }
3391
3392 return 0;
3393}
3394/********** Helpers for find_busiest_group ************************/
3395/*
3396 * sd_lb_stats - Structure to store the statistics of a sched_domain
3397 * during load balancing.
3398 */
3399struct sd_lb_stats {
3400 struct sched_group *busiest; /* Busiest group in this sd */
3401 struct sched_group *this; /* Local group in this sd */
3402 unsigned long total_load; /* Total load of all groups in sd */
3403 unsigned long total_pwr; /* Total power of all groups in sd */
3404 unsigned long avg_load; /* Average load across all groups in sd */
3405
3406 /** Statistics of this group */
3407 unsigned long this_load;
3408 unsigned long this_load_per_task;
3409 unsigned long this_nr_running;
3410
3411 /* Statistics of the busiest group */
3412 unsigned long max_load;
3413 unsigned long busiest_load_per_task;
3414 unsigned long busiest_nr_running;
3415
3416 int group_imb; /* Is there imbalance in this sd */
3417#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3418 int power_savings_balance; /* Is powersave balance needed for this sd */
3419 struct sched_group *group_min; /* Least loaded group in sd */
3420 struct sched_group *group_leader; /* Group which relieves group_min */
3421 unsigned long min_load_per_task; /* load_per_task in group_min */
3422 unsigned long leader_nr_running; /* Nr running of group_leader */
3423 unsigned long min_nr_running; /* Nr running of group_min */
3424#endif
3425};
3426
3427/*
3428 * sg_lb_stats - stats of a sched_group required for load_balancing
3429 */
3430struct sg_lb_stats {
3431 unsigned long avg_load; /* Avg load across the CPUs of the group */
3432 unsigned long group_load; /* Total load over the CPUs of the group */
3433 unsigned long sum_nr_running; /* Nr tasks running in the group */
3434 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3435 unsigned long group_capacity;
3436 int group_imb; /* Is there an imbalance in the group ? */
3437};
3438
3439/**
3440 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3441 * @group: The group whose first cpu is to be returned.
3442 */
3443static inline unsigned int group_first_cpu(struct sched_group *group)
3444{
3445 return cpumask_first(sched_group_cpus(group));
3446}
3447
3448/**
3449 * get_sd_load_idx - Obtain the load index for a given sched domain.
3450 * @sd: The sched_domain whose load_idx is to be obtained.
3451 * @idle: The idle status of the CPU whose sched_domain's load_idx is obtained.
3452 */
3453static inline int get_sd_load_idx(struct sched_domain *sd,
3454 enum cpu_idle_type idle)
3455{
3456 int load_idx;
3457
3458 switch (idle) {
3459 case CPU_NOT_IDLE:
3460 load_idx = sd->busy_idx;
3461 break;
3462
3463 case CPU_NEWLY_IDLE:
3464 load_idx = sd->newidle_idx;
3465 break;
3466 default:
3467 load_idx = sd->idle_idx;
3468 break;
3469 }
3470
3471 return load_idx;
3472}
3473
3474
3475#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3476/**
3477 * init_sd_power_savings_stats - Initialize power savings statistics for
3478 * the given sched_domain, during load balancing.
3479 *
3480 * @sd: Sched domain whose power-savings statistics are to be initialized.
3481 * @sds: Variable containing the statistics for sd.
3482 * @idle: Idle status of the CPU at which we're performing load-balancing.
3483 */
3484static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3485 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3486{
3487 /*
3488 * Busy processors will not participate in power savings
3489 * balance.
3490 */
3491 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3492 sds->power_savings_balance = 0;
3493 else {
3494 sds->power_savings_balance = 1;
3495 sds->min_nr_running = ULONG_MAX;
3496 sds->leader_nr_running = 0;
3497 }
3498}
3499
3500/**
3501 * update_sd_power_savings_stats - Update the power saving stats for a
3502 * sched_domain while performing load balancing.
3503 *
3504 * @group: sched_group belonging to the sched_domain under consideration.
3505 * @sds: Variable containing the statistics of the sched_domain
3506 * @local_group: Does group contain the CPU for which we're performing
3507 * load balancing ?
3508 * @sgs: Variable containing the statistics of the group.
3509 */
3510static inline void update_sd_power_savings_stats(struct sched_group *group,
3511 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3512{
3513
3514 if (!sds->power_savings_balance)
3515 return;
3516
3517 /*
3518 * If the local group is idle or completely loaded
3519 * no need to do power savings balance at this domain
3520 */
3521 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3522 !sds->this_nr_running))
3523 sds->power_savings_balance = 0;
3524
3525 /*
3526 * If a group is already running at full capacity or idle,
3527 * don't include that group in power savings calculations
3528 */
3529 if (!sds->power_savings_balance ||
3530 sgs->sum_nr_running >= sgs->group_capacity ||
3531 !sgs->sum_nr_running)
3532 return;
3533
3534 /*
3535 * Calculate the group which has the least non-idle load.
3536 * This is the group from where we need to pick up the load
3537 * for saving power
3538 */
3539 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3540 (sgs->sum_nr_running == sds->min_nr_running &&
3541 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3542 sds->group_min = group;
3543 sds->min_nr_running = sgs->sum_nr_running;
3544 sds->min_load_per_task = sgs->sum_weighted_load /
3545 sgs->sum_nr_running;
3546 }
3547
3548 /*
3549 * Calculate the group which is nearly at its
3550 * capacity but still has some room to pick up load
3551 * from another group and save more power
3552 */
3553 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3554 return;
3555
3556 if (sgs->sum_nr_running > sds->leader_nr_running ||
3557 (sgs->sum_nr_running == sds->leader_nr_running &&
3558 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3559 sds->group_leader = group;
3560 sds->leader_nr_running = sgs->sum_nr_running;
3561 }
3562}
3563
3564/**
3565 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3566 * @sds: Variable containing the statistics of the sched_domain
3567 * under consideration.
3568 * @this_cpu: Cpu at which we're currently performing load-balancing.
3569 * @imbalance: Variable to store the imbalance.
3570 *
3571 * Description:
3572 * Check if we have potential to perform some power-savings balance.
3573 * If yes, set the busiest group to be the least loaded group in the
3574 * sched_domain, so that its CPUs can be put to idle.
3575 *
3576 * Returns 1 if there is potential to perform power-savings balance.
3577 * Else returns 0.
3578 */
3579static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3580 int this_cpu, unsigned long *imbalance)
3581{
3582 if (!sds->power_savings_balance)
3583 return 0;
3584
3585 if (sds->this != sds->group_leader ||
3586 sds->group_leader == sds->group_min)
3587 return 0;
3588
3589 *imbalance = sds->min_load_per_task;
3590 sds->busiest = sds->group_min;
3591
3592 return 1;
3593
3594}
3595#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3596static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3597 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3598{
3599 return;
3600}
3601
3602static inline void update_sd_power_savings_stats(struct sched_group *group,
3603 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3604{
3605 return;
3606}
3607
3608static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3609 int this_cpu, unsigned long *imbalance)
3610{
3611 return 0;
3612}
3613#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3614
3615
3616unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3617{
3618 return SCHED_LOAD_SCALE;
3619}
3620
3621unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3622{
3623 return default_scale_freq_power(sd, cpu);
3624}
3625
3626unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3627{
3628 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3629 unsigned long smt_gain = sd->smt_gain;
3630
3631 smt_gain /= weight;
3632
3633 return smt_gain;
3634}
3635
3636unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3637{
3638 return default_scale_smt_power(sd, cpu);
3639}
3640
3641unsigned long scale_rt_power(int cpu)
3642{
3643 struct rq *rq = cpu_rq(cpu);
3644 u64 total, available;
3645
3646 sched_avg_update(rq);
3647
3648 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3649 available = total - rq->rt_avg;
3650
3651 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3652 total = SCHED_LOAD_SCALE;
3653
3654 total >>= SCHED_LOAD_SHIFT;
3655
3656 return div_u64(available, total);
3657}
3658
3659static void update_cpu_power(struct sched_domain *sd, int cpu)
3660{
3661 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3662 unsigned long power = SCHED_LOAD_SCALE;
3663 struct sched_group *sdg = sd->groups;
3664
3665 if (sched_feat(ARCH_POWER))
3666 power *= arch_scale_freq_power(sd, cpu);
3667 else
3668 power *= default_scale_freq_power(sd, cpu);
3669
3670 power >>= SCHED_LOAD_SHIFT;
3671
3672 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3673 if (sched_feat(ARCH_POWER))
3674 power *= arch_scale_smt_power(sd, cpu);
3675 else
3676 power *= default_scale_smt_power(sd, cpu);
3677
3678 power >>= SCHED_LOAD_SHIFT;
3679 }
3680
3681 power *= scale_rt_power(cpu);
3682 power >>= SCHED_LOAD_SHIFT;
3683
3684 if (!power)
3685 power = 1;
3686
3687 sdg->cpu_power = power;
3688}
3689
3690static void update_group_power(struct sched_domain *sd, int cpu)
3691{
3692 struct sched_domain *child = sd->child;
3693 struct sched_group *group, *sdg = sd->groups;
3694 unsigned long power;
3695
3696 if (!child) {
3697 update_cpu_power(sd, cpu);
3698 return;
3699 }
3700
3701 power = 0;
3702
3703 group = child->groups;
3704 do {
3705 power += group->cpu_power;
3706 group = group->next;
3707 } while (group != child->groups);
3708
3709 sdg->cpu_power = power;
3710}
3711
3712/**
3713 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3714 * @sd: The sched_domain whose statistics are to be updated.
3715 * @group: sched_group whose statistics are to be updated.
3716 * @this_cpu: Cpu for which load balance is currently performed.
3717 * @idle: Idle status of this_cpu
3718 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3719 * @sd_idle: Idle status of the sched_domain containing group.
3720 * @local_group: Does group contain this_cpu.
3721 * @cpus: Set of cpus considered for load balancing.
3722 * @balance: Should we balance.
3723 * @sgs: variable to hold the statistics for this group.
3724 */
3725static inline void update_sg_lb_stats(struct sched_domain *sd,
3726 struct sched_group *group, int this_cpu,
3727 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3728 int local_group, const struct cpumask *cpus,
3729 int *balance, struct sg_lb_stats *sgs)
3730{
3731 unsigned long load, max_cpu_load, min_cpu_load;
3732 int i;
3733 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3734 unsigned long sum_avg_load_per_task;
3735 unsigned long avg_load_per_task;
3736
3737 if (local_group) {
3738 balance_cpu = group_first_cpu(group);
3739 if (balance_cpu == this_cpu)
3740 update_group_power(sd, this_cpu);
3741 }
3742
3743 /* Tally up the load of all CPUs in the group */
3744 sum_avg_load_per_task = avg_load_per_task = 0;
3745 max_cpu_load = 0;
3746 min_cpu_load = ~0UL;
3747
3748 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3749 struct rq *rq = cpu_rq(i);
3750
3751 if (*sd_idle && rq->nr_running)
3752 *sd_idle = 0;
3753
3754 /* Bias balancing toward cpus of our domain */
3755 if (local_group) {
3756 if (idle_cpu(i) && !first_idle_cpu) {
3757 first_idle_cpu = 1;
3758 balance_cpu = i;
3759 }
3760
3761 load = target_load(i, load_idx);
3762 } else {
3763 load = source_load(i, load_idx);
3764 if (load > max_cpu_load)
3765 max_cpu_load = load;
3766 if (min_cpu_load > load)
3767 min_cpu_load = load;
3768 }
3769
3770 sgs->group_load += load;
3771 sgs->sum_nr_running += rq->nr_running;
3772 sgs->sum_weighted_load += weighted_cpuload(i);
3773
3774 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3775 }
3776
3777 /*
3778 * The first idle cpu or the first cpu (busiest) in this sched group
3779 * is eligible for doing load balancing at this level and above.
3780 * In the newly idle case, we allow all the cpus to do the
3781 * newly idle load balance.
3782 */
3783 if (idle != CPU_NEWLY_IDLE && local_group &&
3784 balance_cpu != this_cpu && balance) {
3785 *balance = 0;
3786 return;
3787 }
3788
3789 /* Adjust by relative CPU power of the group */
3790 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3791
3792
3793 /*
3794 * Consider the group unbalanced when the imbalance is larger
3795 * than the average weight of two tasks.
3796 *
3797 * APZ: with cgroup the avg task weight can vary wildly and
3798 * might not be a suitable number - should we keep a
3799 * normalized nr_running number somewhere that negates
3800 * the hierarchy?
3801 */
3802 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3803 group->cpu_power;
3804
3805 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3806 sgs->group_imb = 1;
3807
3808 sgs->group_capacity =
3809 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3810}
3811
3812/**
3813 * update_sd_lb_stats - Update the sched_domain's statistics for load balancing.
3814 * @sd: sched_domain whose statistics are to be updated.
3815 * @this_cpu: Cpu for which load balance is currently performed.
3816 * @idle: Idle status of this_cpu
3817 * @sd_idle: Idle status of the sched_domain containing group.
3818 * @cpus: Set of cpus considered for load balancing.
3819 * @balance: Should we balance.
3820 * @sds: variable to hold the statistics for this sched_domain.
3821 */
3822static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3823 enum cpu_idle_type idle, int *sd_idle,
3824 const struct cpumask *cpus, int *balance,
3825 struct sd_lb_stats *sds)
3826{
3827 struct sched_domain *child = sd->child;
3828 struct sched_group *group = sd->groups;
3829 struct sg_lb_stats sgs;
3830 int load_idx, prefer_sibling = 0;
3831
3832 if (child && child->flags & SD_PREFER_SIBLING)
3833 prefer_sibling = 1;
3834
3835 init_sd_power_savings_stats(sd, sds, idle);
3836 load_idx = get_sd_load_idx(sd, idle);
3837
3838 do {
3839 int local_group;
3840
3841 local_group = cpumask_test_cpu(this_cpu,
3842 sched_group_cpus(group));
3843 memset(&sgs, 0, sizeof(sgs));
3844 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3845 local_group, cpus, balance, &sgs);
3846
3847 if (local_group && balance && !(*balance))
3848 return;
3849
3850 sds->total_load += sgs.group_load;
3851 sds->total_pwr += group->cpu_power;
3852
3853 /*
3854 * In case the child domain prefers tasks go to siblings
3855 * first, lower the group capacity to one so that we'll try
3856 * and move all the excess tasks away.
3857 */
3858 if (prefer_sibling)
3859 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3860
3861 if (local_group) {
3862 sds->this_load = sgs.avg_load;
3863 sds->this = group;
3864 sds->this_nr_running = sgs.sum_nr_running;
3865 sds->this_load_per_task = sgs.sum_weighted_load;
3866 } else if (sgs.avg_load > sds->max_load &&
3867 (sgs.sum_nr_running > sgs.group_capacity ||
3868 sgs.group_imb)) {
3869 sds->max_load = sgs.avg_load;
3870 sds->busiest = group;
3871 sds->busiest_nr_running = sgs.sum_nr_running;
3872 sds->busiest_load_per_task = sgs.sum_weighted_load;
3873 sds->group_imb = sgs.group_imb;
3874 }
3875
3876 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3877 group = group->next;
3878 } while (group != sd->groups);
3879}
3880
3881/**
3882 * fix_small_imbalance - Calculate the minor imbalance that exists
3883 * amongst the groups of a sched_domain, during
3884 * load balancing.
3885 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3886 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3887 * @imbalance: Variable to store the imbalance.
3888 */
3889static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3890 int this_cpu, unsigned long *imbalance)
3891{
3892 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3893 unsigned int imbn = 2;
3894
3895 if (sds->this_nr_running) {
3896 sds->this_load_per_task /= sds->this_nr_running;
3897 if (sds->busiest_load_per_task >
3898 sds->this_load_per_task)
3899 imbn = 1;
3900 } else
3901 sds->this_load_per_task =
3902 cpu_avg_load_per_task(this_cpu);
3903
3904 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3905 sds->busiest_load_per_task * imbn) {
3906 *imbalance = sds->busiest_load_per_task;
3907 return;
3908 }
3909
3910 /*
3911 * OK, we don't have enough imbalance to justify moving tasks,
3912 * however we may be able to increase total CPU power used by
3913 * moving them.
3914 */
3915
3916 pwr_now += sds->busiest->cpu_power *
3917 min(sds->busiest_load_per_task, sds->max_load);
3918 pwr_now += sds->this->cpu_power *
3919 min(sds->this_load_per_task, sds->this_load);
3920 pwr_now /= SCHED_LOAD_SCALE;
3921
3922 /* Amount of load we'd subtract */
3923 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3924 sds->busiest->cpu_power;
3925 if (sds->max_load > tmp)
3926 pwr_move += sds->busiest->cpu_power *
3927 min(sds->busiest_load_per_task, sds->max_load - tmp);
3928
3929 /* Amount of load we'd add */
3930 if (sds->max_load * sds->busiest->cpu_power <
3931 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3932 tmp = (sds->max_load * sds->busiest->cpu_power) /
3933 sds->this->cpu_power;
3934 else
3935 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3936 sds->this->cpu_power;
3937 pwr_move += sds->this->cpu_power *
3938 min(sds->this_load_per_task, sds->this_load + tmp);
3939 pwr_move /= SCHED_LOAD_SCALE;
3940
3941 /* Move if we gain throughput */
3942 if (pwr_move > pwr_now)
3943 *imbalance = sds->busiest_load_per_task;
3944}
3945
3946/**
3947 * calculate_imbalance - Calculate the amount of imbalance present within the
3948 * groups of a given sched_domain during load balance.
3949 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3950 * @this_cpu: Cpu for which currently load balance is being performed.
3951 * @imbalance: The variable to store the imbalance.
3952 */
3953static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3954 unsigned long *imbalance)
3955{
3956 unsigned long max_pull;
3957 /*
3958 * In the presence of smp nice balancing, certain scenarios can have
3959 * max load less than avg load (as we skip the groups at or below
3960 * their cpu_power while calculating max_load)
3961 */
3962 if (sds->max_load < sds->avg_load) {
3963 *imbalance = 0;
3964 return fix_small_imbalance(sds, this_cpu, imbalance);
3965 }
3966
3967 /* Don't want to pull so many tasks that a group would go idle */
3968 max_pull = min(sds->max_load - sds->avg_load,
3969 sds->max_load - sds->busiest_load_per_task);
3970
3971 /* How much load to actually move to equalise the imbalance */
3972 *imbalance = min(max_pull * sds->busiest->cpu_power,
3973 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3974 / SCHED_LOAD_SCALE;
3975
3976 /*
3977 * if *imbalance is less than the average load per runnable task
3978 * there is no guarantee that any tasks will be moved so we'll have
3979 * a think about bumping its value to force at least one task to be
3980 * moved
3981 */
3982 if (*imbalance < sds->busiest_load_per_task)
3983 return fix_small_imbalance(sds, this_cpu, imbalance);
3984
3985}
3986/******* find_busiest_group() helpers end here *********************/
3987
3988/**
3989 * find_busiest_group - Returns the busiest group within the sched_domain
3990 * if there is an imbalance. If there isn't an imbalance, and
3991 * the user has opted for power-savings, it returns a group whose
3992 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3993 * such a group exists.
3994 *
3995 * Also calculates the amount of weighted load which should be moved
3996 * to restore balance.
3997 *
3998 * @sd: The sched_domain whose busiest group is to be returned.
3999 * @this_cpu: The cpu for which load balancing is currently being performed.
4000 * @imbalance: Variable which stores amount of weighted load which should
4001 * be moved to restore balance/put a group to idle.
4002 * @idle: The idle status of this_cpu.
4003 * @sd_idle: The idleness of sd
4004 * @cpus: The set of CPUs under consideration for load-balancing.
4005 * @balance: Pointer to a variable indicating if this_cpu
4006 * is the appropriate cpu to perform load balancing at this_level.
4007 *
4008 * Returns: - the busiest group if imbalance exists.
4009 * - If no imbalance and user has opted for power-savings balance,
4010 * return the least loaded group whose CPUs can be
4011 * put to idle by rebalancing its tasks onto our group.
4012 */
4013static struct sched_group *
4014find_busiest_group(struct sched_domain *sd, int this_cpu,
4015 unsigned long *imbalance, enum cpu_idle_type idle,
4016 int *sd_idle, const struct cpumask *cpus, int *balance)
4017{
4018 struct sd_lb_stats sds;
4019
4020 memset(&sds, 0, sizeof(sds));
4021
4022 /*
4023 * Compute the various statistics relevant for load balancing at
4024 * this level.
4025 */
4026 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4027 balance, &sds);
4028
4029 /* Cases where imbalance does not exist from POV of this_cpu */
4030 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4031 * at this level.
4032 * 2) There is no busy sibling group to pull from.
4033 * 3) This group is the busiest group.
4034 * 4) This group is busier than the average busyness at this
4035 * sched_domain.
4036 * 5) The imbalance is within the specified limit.
4037 * 6) Any rebalance would lead to ping-pong
4038 */
4039 if (balance && !(*balance))
4040 goto ret;
4041
4042 if (!sds.busiest || sds.busiest_nr_running == 0)
4043 goto out_balanced;
4044
4045 if (sds.this_load >= sds.max_load)
4046 goto out_balanced;
4047
4048 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4049
4050 if (sds.this_load >= sds.avg_load)
4051 goto out_balanced;
4052
4053 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4054 goto out_balanced;
4055
4056 sds.busiest_load_per_task /= sds.busiest_nr_running;
4057 if (sds.group_imb)
4058 sds.busiest_load_per_task =
4059 min(sds.busiest_load_per_task, sds.avg_load);
4060
4061 /*
4062 * We're trying to get all the cpus to the average_load, so we don't
4063 * want to push ourselves above the average load, nor do we wish to
4064 * reduce the max loaded cpu below the average load, as either of these
4065 * actions would just result in more rebalancing later, and ping-pong
4066 * tasks around. Thus we look for the minimum possible imbalance.
4067 * Negative imbalances (*we* are more loaded than anyone else) will
4068 * be counted as no imbalance for these purposes -- we can't fix that
4069 * by pulling tasks to us. Be careful of negative numbers as they'll
4070 * appear as very large values with unsigned longs.
4071 */
4072 if (sds.max_load <= sds.busiest_load_per_task)
4073 goto out_balanced;
4074
4075 /* Looks like there is an imbalance. Compute it */
4076 calculate_imbalance(&sds, this_cpu, imbalance);
4077 return sds.busiest;
4078
4079out_balanced:
4080 /*
4081 * There is no obvious imbalance. But check if we can do some balancing
4082 * to save power.
4083 */
4084 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4085 return sds.busiest;
4086ret:
4087 *imbalance = 0;
4088 return NULL;
4089}
4090
4091/*
4092 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4093 */
4094static struct rq *
4095find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4096 unsigned long imbalance, const struct cpumask *cpus)
4097{
4098 struct rq *busiest = NULL, *rq;
4099 unsigned long max_load = 0;
4100 int i;
4101
4102 for_each_cpu(i, sched_group_cpus(group)) {
4103 unsigned long power = power_of(i);
4104 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4105 unsigned long wl;
4106
4107 if (!cpumask_test_cpu(i, cpus))
4108 continue;
4109
4110 rq = cpu_rq(i);
4111 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4112 wl /= power;
4113
4114 if (capacity && rq->nr_running == 1 && wl > imbalance)
4115 continue;
4116
4117 if (wl > max_load) {
4118 max_load = wl;
4119 busiest = rq;
4120 }
4121 }
4122
4123 return busiest;
4124}
4125
4126/*
4127 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; anything
4128 * works so long as it is large enough.
4129 */
4130#define MAX_PINNED_INTERVAL 512
4131
4132/* Working cpumask for load_balance and load_balance_newidle. */
4133static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4134
4135/*
4136 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4137 * tasks if there is an imbalance.
4138 */
4139static int load_balance(int this_cpu, struct rq *this_rq,
4140 struct sched_domain *sd, enum cpu_idle_type idle,
4141 int *balance)
4142{
4143 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4144 struct sched_group *group;
4145 unsigned long imbalance;
4146 struct rq *busiest;
4147 unsigned long flags;
4148 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4149
4150 cpumask_copy(cpus, cpu_active_mask);
4151
4152 /*
4153 * When power savings policy is enabled for the parent domain, idle
4154 * sibling can pick up load irrespective of busy siblings. In this case,
4155 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4156 * portraying it as CPU_NOT_IDLE.
4157 */
4158 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4159 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4160 sd_idle = 1;
4161
4162 schedstat_inc(sd, lb_count[idle]);
4163
4164redo:
4165 update_shares(sd);
4166 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4167 cpus, balance);
4168
4169 if (*balance == 0)
4170 goto out_balanced;
4171
4172 if (!group) {
4173 schedstat_inc(sd, lb_nobusyg[idle]);
4174 goto out_balanced;
4175 }
4176
4177 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4178 if (!busiest) {
4179 schedstat_inc(sd, lb_nobusyq[idle]);
4180 goto out_balanced;
4181 }
4182
4183 BUG_ON(busiest == this_rq);
4184
4185 schedstat_add(sd, lb_imbalance[idle], imbalance);
4186
4187 ld_moved = 0;
4188 if (busiest->nr_running > 1) {
4189 /*
4190 * Attempt to move tasks. If find_busiest_group has found
4191 * an imbalance but busiest->nr_running <= 1, the group is
4192 * still unbalanced. ld_moved simply stays zero, so it is
4193 * correctly treated as an imbalance.
4194 */
4195 local_irq_save(flags);
4196 double_rq_lock(this_rq, busiest);
4197 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4198 imbalance, sd, idle, &all_pinned);
4199 double_rq_unlock(this_rq, busiest);
4200 local_irq_restore(flags);
4201
4202 /*
4203 * some other cpu did the load balance for us.
4204 */
4205 if (ld_moved && this_cpu != smp_processor_id())
4206 resched_cpu(this_cpu);
4207
4208 /* All tasks on this runqueue were pinned by CPU affinity */
4209 if (unlikely(all_pinned)) {
4210 cpumask_clear_cpu(cpu_of(busiest), cpus);
4211 if (!cpumask_empty(cpus))
4212 goto redo;
4213 goto out_balanced;
4214 }
4215 }
4216
4217 if (!ld_moved) {
4218 schedstat_inc(sd, lb_failed[idle]);
4219 sd->nr_balance_failed++;
4220
4221 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4222
4223 raw_spin_lock_irqsave(&busiest->lock, flags);
4224
4225 /* don't kick the migration_thread, if the curr
4226 * task on busiest cpu can't be moved to this_cpu
4227 */
4228 if (!cpumask_test_cpu(this_cpu,
4229 &busiest->curr->cpus_allowed)) {
4230 raw_spin_unlock_irqrestore(&busiest->lock,
4231 flags);
4232 all_pinned = 1;
4233 goto out_one_pinned;
4234 }
4235
4236 if (!busiest->active_balance) {
4237 busiest->active_balance = 1;
4238 busiest->push_cpu = this_cpu;
4239 active_balance = 1;
4240 }
4241 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4242 if (active_balance)
4243 wake_up_process(busiest->migration_thread);
4244
4245 /*
4246 * We've kicked active balancing, reset the failure
4247 * counter.
4248 */
4249 sd->nr_balance_failed = sd->cache_nice_tries+1;
4250 }
4251 } else
4252 sd->nr_balance_failed = 0;
4253
4254 if (likely(!active_balance)) {
4255 /* We were unbalanced, so reset the balancing interval */
4256 sd->balance_interval = sd->min_interval;
4257 } else {
4258 /*
4259 * If we've begun active balancing, start to back off. This
4260 * case may not be covered by the all_pinned logic if there
4261 * is only 1 task on the busy runqueue (because we don't call
4262 * move_tasks).
4263 */
4264 if (sd->balance_interval < sd->max_interval)
4265 sd->balance_interval *= 2;
4266 }
4267
4268 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4269 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4270 ld_moved = -1;
4271
4272 goto out;
4273
4274out_balanced:
4275 schedstat_inc(sd, lb_balanced[idle]);
4276
4277 sd->nr_balance_failed = 0;
4278
4279out_one_pinned:
4280 /* tune up the balancing interval */
4281 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4282 (sd->balance_interval < sd->max_interval))
4283 sd->balance_interval *= 2;
4284
4285 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4286 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4287 ld_moved = -1;
4288 else
4289 ld_moved = 0;
4290out:
4291 if (ld_moved)
4292 update_shares(sd);
4293 return ld_moved;
4294}
4295
4296/*
4297 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4298 * tasks if there is an imbalance.
4299 *
4300 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4301 * this_rq is locked.
4302 */
4303static int
4304load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4305{
4306 struct sched_group *group;
4307 struct rq *busiest = NULL;
4308 unsigned long imbalance;
4309 int ld_moved = 0;
4310 int sd_idle = 0;
4311 int all_pinned = 0;
4312 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4313
4314 cpumask_copy(cpus, cpu_active_mask);
4315
4316 /*
4317 * When power savings policy is enabled for the parent domain, idle
4318 * sibling can pick up load irrespective of busy siblings. In this case,
4319 * let the state of idle sibling percolate up as IDLE, instead of
4320 * portraying it as CPU_NOT_IDLE.
4321 */
4322 if (sd->flags & SD_SHARE_CPUPOWER &&
4323 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4324 sd_idle = 1;
4325
4326 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4327redo:
4328 update_shares_locked(this_rq, sd);
4329 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4330 &sd_idle, cpus, NULL);
4331 if (!group) {
4332 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4333 goto out_balanced;
4334 }
4335
4336 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4337 if (!busiest) {
4338 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4339 goto out_balanced;
4340 }
4341
4342 BUG_ON(busiest == this_rq);
4343
4344 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4345
4346 ld_moved = 0;
4347 if (busiest->nr_running > 1) {
4348 /* Attempt to move tasks */
4349 double_lock_balance(this_rq, busiest);
4350 /* this_rq->clock is already updated */
4351 update_rq_clock(busiest);
4352 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4353 imbalance, sd, CPU_NEWLY_IDLE,
4354 &all_pinned);
4355 double_unlock_balance(this_rq, busiest);
4356
4357 if (unlikely(all_pinned)) {
4358 cpumask_clear_cpu(cpu_of(busiest), cpus);
4359 if (!cpumask_empty(cpus))
4360 goto redo;
4361 }
4362 }
4363
4364 if (!ld_moved) {
4365 int active_balance = 0;
4366
4367 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4368 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4369 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4370 return -1;
4371
4372 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4373 return -1;
4374
4375 if (sd->nr_balance_failed++ < 2)
4376 return -1;
4377
4378 /*
4379 * The only task running in a non-idle cpu can be moved to this
4380 * cpu in an attempt to completely free up the other CPU
4381 * package. The same method used to move a task in load_balance()
4382 * has been extended for load_balance_newidle() to speed up
4383 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4384 *
4385 * The package power saving logic comes from
4386 * find_busiest_group(). If there is no imbalance, then
4387 * f_b_g() will return NULL. However when sched_mc={1,2} then
4388 * f_b_g() will select a group from which a running task may be
4389 * pulled to this cpu in order to make the other package idle.
4390 * If there is no opportunity to make a package idle and if
4391 * there is no imbalance, then f_b_g() will return NULL and no
4392 * action will be taken in load_balance_newidle().
4393 *
4394 * Under normal task pull operation due to imbalance, there
4395 * will be more than one task in the source run queue and
4396 * move_tasks() will succeed. ld_moved will be true and this
4397 * active balance code will not be triggered.
4398 */
4399
4400 /* Lock busiest in correct order while this_rq is held */
4401 double_lock_balance(this_rq, busiest);
4402
4403 /*
4404 * don't kick the migration_thread, if the curr
4405 * task on busiest cpu can't be moved to this_cpu
4406 */
4407 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4408 double_unlock_balance(this_rq, busiest);
4409 all_pinned = 1;
4410 return ld_moved;
4411 }
4412
4413 if (!busiest->active_balance) {
4414 busiest->active_balance = 1;
4415 busiest->push_cpu = this_cpu;
4416 active_balance = 1;
4417 }
4418
4419 double_unlock_balance(this_rq, busiest);
4420 /*
4421 * Should not call ttwu while holding a rq->lock
4422 */
4423 raw_spin_unlock(&this_rq->lock);
4424 if (active_balance)
4425 wake_up_process(busiest->migration_thread);
4426 raw_spin_lock(&this_rq->lock);
4427
4428 } else
4429 sd->nr_balance_failed = 0;
4430
4431 update_shares_locked(this_rq, sd);
4432 return ld_moved;
4433
4434out_balanced:
4435 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4436 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4437 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4438 return -1;
4439 sd->nr_balance_failed = 0;
4440
4441 return 0;
4442}
4443
4444/*
4445 * idle_balance is called by schedule() if this_cpu is about to become
4446 * idle. Attempts to pull tasks from other CPUs.
4447 */
4448static void idle_balance(int this_cpu, struct rq *this_rq)
4449{
4450 struct sched_domain *sd;
4451 int pulled_task = 0;
4452 unsigned long next_balance = jiffies + HZ;
4453
4454 this_rq->idle_stamp = this_rq->clock;
4455
4456 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4457 return;
4458
4459 for_each_domain(this_cpu, sd) {
4460 unsigned long interval;
4461
4462 if (!(sd->flags & SD_LOAD_BALANCE))
4463 continue;
4464
4465 if (sd->flags & SD_BALANCE_NEWIDLE)
4466 /* If we've pulled tasks over stop searching: */
4467 pulled_task = load_balance_newidle(this_cpu, this_rq,
4468 sd);
4469
4470 interval = msecs_to_jiffies(sd->balance_interval);
4471 if (time_after(next_balance, sd->last_balance + interval))
4472 next_balance = sd->last_balance + interval;
4473 if (pulled_task) {
4474 this_rq->idle_stamp = 0;
4475 break;
4476 }
4477 }
4478 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4479 /*
4480 * We are going idle. next_balance may be set based on
4481 * a busy processor. So reset next_balance.
4482 */
4483 this_rq->next_balance = next_balance;
4484 }
4485}
4486
4487/*
4488 * active_load_balance is run by migration threads. It pushes running tasks
4489 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4490 * running on each physical CPU where possible, and avoids physical /
4491 * logical imbalances.
4492 *
4493 * Called with busiest_rq locked.
4494 */
4495static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4496{
4497 int target_cpu = busiest_rq->push_cpu;
4498 struct sched_domain *sd;
4499 struct rq *target_rq;
4500
4501 /* Is there any task to move? */
4502 if (busiest_rq->nr_running <= 1)
4503 return;
4504
4505 target_rq = cpu_rq(target_cpu);
4506
4507 /*
4508 * This condition is "impossible", if it occurs
4509 * we need to fix it. Originally reported by
4510 * Bjorn Helgaas on a 128-cpu setup.
4511 */
4512 BUG_ON(busiest_rq == target_rq);
4513
4514 /* move a task from busiest_rq to target_rq */
4515 double_lock_balance(busiest_rq, target_rq);
4516 update_rq_clock(busiest_rq);
4517 update_rq_clock(target_rq);
4518
4519 /* Search for an sd spanning us and the target CPU. */
4520 for_each_domain(target_cpu, sd) {
4521 if ((sd->flags & SD_LOAD_BALANCE) &&
4522 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4523 break;
4524 }
4525
4526 if (likely(sd)) {
4527 schedstat_inc(sd, alb_count);
4528
4529 if (move_one_task(target_rq, target_cpu, busiest_rq,
4530 sd, CPU_IDLE))
4531 schedstat_inc(sd, alb_pushed);
4532 else
4533 schedstat_inc(sd, alb_failed);
4534 }
4535 double_unlock_balance(busiest_rq, target_rq);
4536}
4537
4538#ifdef CONFIG_NO_HZ
4539static struct {
4540 atomic_t load_balancer;
4541 cpumask_var_t cpu_mask;
4542 cpumask_var_t ilb_grp_nohz_mask;
4543} nohz ____cacheline_aligned = {
4544 .load_balancer = ATOMIC_INIT(-1),
4545};
4546
4547int get_nohz_load_balancer(void)
4548{
4549 return atomic_read(&nohz.load_balancer);
4550}
4551
4552#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4553/**
4554 * lowest_flag_domain - Return lowest sched_domain containing flag.
4555 * @cpu: The cpu whose lowest level of sched domain is to
4556 * be returned.
4557 * @flag: The flag to check for the lowest sched_domain
4558 * for the given cpu.
4559 *
4560 * Returns the lowest sched_domain of a cpu which contains the given flag.
4561 */
4562static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4563{
4564 struct sched_domain *sd;
4565
4566 for_each_domain(cpu, sd)
4567 if (sd && (sd->flags & flag))
4568 break;
4569
4570 return sd;
4571}
4572
4573/**
4574 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4575 * @cpu: The cpu whose domains we're iterating over.
4576 * @sd: variable holding the value of the power_savings_sd
4577 * for cpu.
4578 * @flag: The flag to filter the sched_domains to be iterated.
4579 *
4580 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4581 * set, starting from the lowest sched_domain to the highest.
4582 */
4583#define for_each_flag_domain(cpu, sd, flag) \
4584 for (sd = lowest_flag_domain(cpu, flag); \
4585 (sd && (sd->flags & flag)); sd = sd->parent)
4586
4587/**
4588 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4589 * @ilb_group: group to be checked for semi-idleness
4590 *
4591 * Returns: 1 if the group is semi-idle. 0 otherwise.
4592 *
4593 * We define a sched_group to be semi-idle if it has at least one idle CPU
4594 * and at least one non-idle CPU. This helper function checks if the given
4595 * sched_group is semi-idle or not.
4596 */
4597static inline int is_semi_idle_group(struct sched_group *ilb_group)
4598{
4599 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4600 sched_group_cpus(ilb_group));
4601
4602 /*
4603 * A sched_group is semi-idle when it has at least one busy cpu
4604 * and at least one idle cpu.
4605 */
4606 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4607 return 0;
4608
4609 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4610 return 0;
4611
4612 return 1;
4613}
4614/**
4615 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4616 * @cpu: The cpu which is nominating a new idle_load_balancer.
4617 *
4618 * Returns: The id of the idle load balancer if it exists;
4619 * else returns >= nr_cpu_ids.
4620 *
4621 * This algorithm picks the idle load balancer such that it belongs to a
4622 * semi-idle powersavings sched_domain. The idea is to try and avoid
4623 * completely idle packages/cores just for the purpose of idle load balancing
4624 * when there are other idle cpu's which are better suited for that job.
4625 */
4626static int find_new_ilb(int cpu)
4627{
4628 struct sched_domain *sd;
4629 struct sched_group *ilb_group;
4630
4631 /*
4632 * Have idle load balancer selection from semi-idle packages only
4633 * when power-aware load balancing is enabled
4634 */
4635 if (!(sched_smt_power_savings || sched_mc_power_savings))
4636 goto out_done;
4637
4638 /*
4639 * Optimize for the case when we have no idle CPUs or only one
4640 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4641 */
4642 if (cpumask_weight(nohz.cpu_mask) < 2)
4643 goto out_done;
4644
4645 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4646 ilb_group = sd->groups;
4647
4648 do {
4649 if (is_semi_idle_group(ilb_group))
4650 return cpumask_first(nohz.ilb_grp_nohz_mask);
4651
4652 ilb_group = ilb_group->next;
4653
4654 } while (ilb_group != sd->groups);
4655 }
4656
4657out_done:
4658 return cpumask_first(nohz.cpu_mask);
4659}
4660#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4661static inline int find_new_ilb(int call_cpu)
4662{
4663 return cpumask_first(nohz.cpu_mask);
4664}
4665#endif
4666
4667/*
4668 * This routine will try to nominate the ilb (idle load balancing)
4669 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4670 * load balancing on behalf of all those cpus. If all the cpus in the system
4671 * go into this tickless mode, then there will be no ilb owner (as there is
4672 * no need for one) and all the cpus will sleep till the next wakeup event
4673 * arrives...
4674 *
4675 * For the ilb owner, tick is not stopped. And this tick will be used
4676 * for idle load balancing. ilb owner will still be part of
4677 * nohz.cpu_mask..
4678 *
4679 * While stopping the tick, this cpu will become the ilb owner if there
4680 * is no other owner. It will remain the owner till that cpu becomes busy
4681 * or all cpus in the system stop their ticks, at which point
4682 * there is no need for an ilb owner.
4683 *
4684 * When the ilb owner becomes busy, it nominates another owner, during the
4685 * next busy scheduler_tick()
4686 */
4687int select_nohz_load_balancer(int stop_tick)
4688{
4689 int cpu = smp_processor_id();
4690
4691 if (stop_tick) {
4692 cpu_rq(cpu)->in_nohz_recently = 1;
4693
4694 if (!cpu_active(cpu)) {
4695 if (atomic_read(&nohz.load_balancer) != cpu)
4696 return 0;
4697
4698 /*
4699 * If we are going offline and still the leader,
4700 * give up!
4701 */
4702 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4703 BUG();
4704
4705 return 0;
4706 }
4707
4708 cpumask_set_cpu(cpu, nohz.cpu_mask);
4709
4710 /* time for ilb owner also to sleep */
4711 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4712 if (atomic_read(&nohz.load_balancer) == cpu)
4713 atomic_set(&nohz.load_balancer, -1);
4714 return 0;
4715 }
4716
4717 if (atomic_read(&nohz.load_balancer) == -1) {
4718 /* make me the ilb owner */
4719 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4720 return 1;
4721 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4722 int new_ilb;
4723
4724 if (!(sched_smt_power_savings ||
4725 sched_mc_power_savings))
4726 return 1;
4727 /*
4728 * Check to see if there is a more power-efficient
4729 * ilb.
4730 */
4731 new_ilb = find_new_ilb(cpu);
4732 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4733 atomic_set(&nohz.load_balancer, -1);
4734 resched_cpu(new_ilb);
4735 return 0;
4736 }
4737 return 1;
4738 }
4739 } else {
4740 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4741 return 0;
4742
4743 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4744
4745 if (atomic_read(&nohz.load_balancer) == cpu)
4746 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4747 BUG();
4748 }
4749 return 0;
4750}
4751#endif
4752
4753static DEFINE_SPINLOCK(balancing);
4754
4755/*
4756 * It checks each scheduling domain to see if it is due to be balanced,
4757 * and initiates a balancing operation if so.
4758 *
4759 * Balancing parameters are set up in arch_init_sched_domains.
4760 */
4761static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4762{
4763 int balance = 1;
4764 struct rq *rq = cpu_rq(cpu);
4765 unsigned long interval;
4766 struct sched_domain *sd;
4767 /* Earliest time when we have to do rebalance again */
4768 unsigned long next_balance = jiffies + 60*HZ;
4769 int update_next_balance = 0;
4770 int need_serialize;
4771
4772 for_each_domain(cpu, sd) {
4773 if (!(sd->flags & SD_LOAD_BALANCE))
4774 continue;
4775
4776 interval = sd->balance_interval;
4777 if (idle != CPU_IDLE)
4778 interval *= sd->busy_factor;
4779
4780 /* scale ms to jiffies */
4781 interval = msecs_to_jiffies(interval);
4782 if (unlikely(!interval))
4783 interval = 1;
4784 if (interval > HZ*NR_CPUS/10)
4785 interval = HZ*NR_CPUS/10;
4786
4787 need_serialize = sd->flags & SD_SERIALIZE;
4788
4789 if (need_serialize) {
4790 if (!spin_trylock(&balancing))
4791 goto out;
4792 }
4793
4794 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4795 if (load_balance(cpu, rq, sd, idle, &balance)) {
4796 /*
4797 * We've pulled tasks over so either we're no
4798 * longer idle, or one of our SMT siblings is
4799 * not idle.
4800 */
4801 idle = CPU_NOT_IDLE;
4802 }
4803 sd->last_balance = jiffies;
4804 }
4805 if (need_serialize)
4806 spin_unlock(&balancing);
4807out:
4808 if (time_after(next_balance, sd->last_balance + interval)) {
4809 next_balance = sd->last_balance + interval;
4810 update_next_balance = 1;
4811 }
4812
4813 /*
4814 * Stop the load balance at this level. There is another
4815 * CPU in our sched group which is doing load balancing more
4816 * actively.
4817 */
4818 if (!balance)
4819 break;
4820 }
4821
4822 /*
4823 * next_balance will be updated only when there is a need.
4824 * When the cpu is attached to null domain for ex, it will not be
4825 * updated.
4826 */
4827 if (likely(update_next_balance))
4828 rq->next_balance = next_balance;
4829}
4830
4831/*
4832 * run_rebalance_domains is triggered when needed from the scheduler tick.
4833 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4834 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4835 */
4836static void run_rebalance_domains(struct softirq_action *h)
4837{
4838 int this_cpu = smp_processor_id();
4839 struct rq *this_rq = cpu_rq(this_cpu);
4840 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4841 CPU_IDLE : CPU_NOT_IDLE;
4842
4843 rebalance_domains(this_cpu, idle);
4844
4845#ifdef CONFIG_NO_HZ
4846 /*
4847 * If this cpu is the owner for idle load balancing, then do the
4848 * balancing on behalf of the other idle cpus whose ticks are
4849 * stopped.
4850 */
4851 if (this_rq->idle_at_tick &&
4852 atomic_read(&nohz.load_balancer) == this_cpu) {
4853 struct rq *rq;
4854 int balance_cpu;
4855
4856 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4857 if (balance_cpu == this_cpu)
4858 continue;
4859
4860 /*
4861 * If this cpu gets work to do, stop the load balancing
4862 * work being done for other cpus. Next load
4863 * balancing owner will pick it up.
4864 */
4865 if (need_resched())
4866 break;
4867
4868 rebalance_domains(balance_cpu, CPU_IDLE);
4869
4870 rq = cpu_rq(balance_cpu);
4871 if (time_after(this_rq->next_balance, rq->next_balance))
4872 this_rq->next_balance = rq->next_balance;
4873 }
4874 }
4875#endif
4876}
4877
4878static inline int on_null_domain(int cpu)
4879{
4880 return !rcu_dereference(cpu_rq(cpu)->sd);
4881}
4882
4883/*
4884 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4885 *
4886 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4887 * idle load balancing owner or decide to stop the periodic load balancing,
4888 * if the whole system is idle.
4889 */
4890static inline void trigger_load_balance(struct rq *rq, int cpu)
4891{
4892#ifdef CONFIG_NO_HZ
4893 /*
4894 * If we were in the nohz mode recently and busy at the current
4895 * scheduler tick, then check if we need to nominate new idle
4896 * load balancer.
4897 */
4898 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4899 rq->in_nohz_recently = 0;
4900
4901 if (atomic_read(&nohz.load_balancer) == cpu) {
4902 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4903 atomic_set(&nohz.load_balancer, -1);
4904 }
4905
4906 if (atomic_read(&nohz.load_balancer) == -1) {
4907 int ilb = find_new_ilb(cpu);
4908
4909 if (ilb < nr_cpu_ids)
4910 resched_cpu(ilb);
4911 }
4912 }
4913
4914 /*
4915 * If this cpu is idle and doing idle load balancing for all the
4916 * cpus with ticks stopped, is it time for that to stop?
4917 */
4918 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4919 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4920 resched_cpu(cpu);
4921 return;
4922 }
4923
4924 /*
4925 * If this cpu is idle and the idle load balancing is done by
4926 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4927 */
4928 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4929 cpumask_test_cpu(cpu, nohz.cpu_mask))
4930 return;
4931#endif
4932 /* Don't need to rebalance while attached to NULL domain */
4933 if (time_after_eq(jiffies, rq->next_balance) &&
4934 likely(!on_null_domain(cpu)))
4935 raise_softirq(SCHED_SOFTIRQ);
4936}
4937
4938#else /* CONFIG_SMP */
4939
4940/*
4941 * on UP we do not need to balance between CPUs:
4942 */
4943static inline void idle_balance(int cpu, struct rq *rq)
4944{
4945}
4946
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5298,7 +3430,7 @@ void scheduler_tick(void)
5298 curr->sched_class->task_tick(rq, curr, 0); 3430 curr->sched_class->task_tick(rq, curr, 0);
5299 raw_spin_unlock(&rq->lock); 3431 raw_spin_unlock(&rq->lock);
5300 3432
5301 perf_event_task_tick(curr, cpu); 3433 perf_event_task_tick(curr);
5302 3434
5303#ifdef CONFIG_SMP 3435#ifdef CONFIG_SMP
5304 rq->idle_at_tick = idle_cpu(cpu); 3436 rq->idle_at_tick = idle_cpu(cpu);
@@ -5412,23 +3544,9 @@ static inline void schedule_debug(struct task_struct *prev)
5412 3544
5413static void put_prev_task(struct rq *rq, struct task_struct *prev) 3545static void put_prev_task(struct rq *rq, struct task_struct *prev)
5414{ 3546{
5415 if (prev->state == TASK_RUNNING) { 3547 if (prev->se.on_rq)
5416 u64 runtime = prev->se.sum_exec_runtime; 3548 update_rq_clock(rq);
5417 3549 rq->skip_clock_update = 0;
5418 runtime -= prev->se.prev_sum_exec_runtime;
5419 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5420
5421 /*
5422 * In order to avoid avg_overlap growing stale when we are
5423 * indeed overlapping and hence not getting put to sleep, grow
5424 * the avg_overlap on preemption.
5425 *
5426 * We use the average preemption runtime because that
5427 * correlates to the amount of cache footprint a task can
5428 * build up.
5429 */
5430 update_avg(&prev->se.avg_overlap, runtime);
5431 }
5432 prev->sched_class->put_prev_task(rq, prev); 3550 prev->sched_class->put_prev_task(rq, prev);
5433} 3551}
5434 3552
@@ -5478,7 +3596,7 @@ need_resched:
5478 preempt_disable(); 3596 preempt_disable();
5479 cpu = smp_processor_id(); 3597 cpu = smp_processor_id();
5480 rq = cpu_rq(cpu); 3598 rq = cpu_rq(cpu);
5481 rcu_sched_qs(cpu); 3599 rcu_note_context_switch(cpu);
5482 prev = rq->curr; 3600 prev = rq->curr;
5483 switch_count = &prev->nivcsw; 3601 switch_count = &prev->nivcsw;
5484 3602
@@ -5491,14 +3609,13 @@ need_resched_nonpreemptible:
5491 hrtick_clear(rq); 3609 hrtick_clear(rq);
5492 3610
5493 raw_spin_lock_irq(&rq->lock); 3611 raw_spin_lock_irq(&rq->lock);
5494 update_rq_clock(rq);
5495 clear_tsk_need_resched(prev); 3612 clear_tsk_need_resched(prev);
5496 3613
5497 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5498 if (unlikely(signal_pending_state(prev->state, prev))) 3615 if (unlikely(signal_pending_state(prev->state, prev)))
5499 prev->state = TASK_RUNNING; 3616 prev->state = TASK_RUNNING;
5500 else 3617 else
5501 deactivate_task(rq, prev, 1); 3618 deactivate_task(rq, prev, DEQUEUE_SLEEP);
5502 switch_count = &prev->nvcsw; 3619 switch_count = &prev->nvcsw;
5503 } 3620 }
5504 3621
@@ -5512,7 +3629,7 @@ need_resched_nonpreemptible:
5512 3629
5513 if (likely(prev != next)) { 3630 if (likely(prev != next)) {
5514 sched_info_switch(prev, next); 3631 sched_info_switch(prev, next);
5515 perf_event_task_sched_out(prev, next, cpu); 3632 perf_event_task_sched_out(prev, next);
5516 3633
5517 rq->nr_switches++; 3634 rq->nr_switches++;
5518 rq->curr = next; 3635 rq->curr = next;
@@ -5562,7 +3679,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5562 * the mutex owner just released it and exited. 3679 * the mutex owner just released it and exited.
5563 */ 3680 */
5564 if (probe_kernel_address(&owner->cpu, cpu)) 3681 if (probe_kernel_address(&owner->cpu, cpu))
5565 goto out; 3682 return 0;
5566#else 3683#else
5567 cpu = owner->cpu; 3684 cpu = owner->cpu;
5568#endif 3685#endif
@@ -5572,14 +3689,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5572 * the cpu field may no longer be valid. 3689 * the cpu field may no longer be valid.
5573 */ 3690 */
5574 if (cpu >= nr_cpumask_bits) 3691 if (cpu >= nr_cpumask_bits)
5575 goto out; 3692 return 0;
5576 3693
5577 /* 3694 /*
5578 * We need to validate that we can do a 3695 * We need to validate that we can do a
5579 * get_cpu() and that we have the percpu area. 3696 * get_cpu() and that we have the percpu area.
5580 */ 3697 */
5581 if (!cpu_online(cpu)) 3698 if (!cpu_online(cpu))
5582 goto out; 3699 return 0;
5583 3700
5584 rq = cpu_rq(cpu); 3701 rq = cpu_rq(cpu);
5585 3702
@@ -5598,7 +3715,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5598 3715
5599 cpu_relax(); 3716 cpu_relax();
5600 } 3717 }
5601out: 3718
5602 return 1; 3719 return 1;
5603} 3720}
5604#endif 3721#endif
@@ -5722,6 +3839,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5722{ 3839{
5723 __wake_up_common(q, mode, 1, 0, NULL); 3840 __wake_up_common(q, mode, 1, 0, NULL);
5724} 3841}
3842EXPORT_SYMBOL_GPL(__wake_up_locked);
5725 3843
5726void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3844void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5727{ 3845{
@@ -5821,8 +3939,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
5821 if (!x->done) { 3939 if (!x->done) {
5822 DECLARE_WAITQUEUE(wait, current); 3940 DECLARE_WAITQUEUE(wait, current);
5823 3941
5824 wait.flags |= WQ_FLAG_EXCLUSIVE; 3942 __add_wait_queue_tail_exclusive(&x->wait, &wait);
5825 __add_wait_queue_tail(&x->wait, &wait);
5826 do { 3943 do {
5827 if (signal_pending_state(state, current)) { 3944 if (signal_pending_state(state, current)) {
5828 timeout = -ERESTARTSYS; 3945 timeout = -ERESTARTSYS;
@@ -5933,6 +4050,23 @@ int __sched wait_for_completion_killable(struct completion *x)
5933EXPORT_SYMBOL(wait_for_completion_killable); 4050EXPORT_SYMBOL(wait_for_completion_killable);
5934 4051
5935/** 4052/**
4053 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4054 * @x: holds the state of this particular completion
4055 * @timeout: timeout value in jiffies
4056 *
4057 * This waits for either a completion of a specific task to be
4058 * signaled or for a specified timeout to expire. It can be
4059 * interrupted by a kill signal. The timeout is in jiffies.
4060 */
4061unsigned long __sched
4062wait_for_completion_killable_timeout(struct completion *x,
4063 unsigned long timeout)
4064{
4065 return wait_for_common(x, timeout, TASK_KILLABLE);
4066}
4067EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4068
4069/**
5936 * try_wait_for_completion - try to decrement a completion without blocking 4070 * try_wait_for_completion - try to decrement a completion without blocking
5937 * @x: completion structure 4071 * @x: completion structure
5938 * 4072 *
@@ -6043,14 +4177,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6043 unsigned long flags; 4177 unsigned long flags;
6044 int oldprio, on_rq, running; 4178 int oldprio, on_rq, running;
6045 struct rq *rq; 4179 struct rq *rq;
6046 const struct sched_class *prev_class = p->sched_class; 4180 const struct sched_class *prev_class;
6047 4181
6048 BUG_ON(prio < 0 || prio > MAX_PRIO); 4182 BUG_ON(prio < 0 || prio > MAX_PRIO);
6049 4183
6050 rq = task_rq_lock(p, &flags); 4184 rq = task_rq_lock(p, &flags);
6051 update_rq_clock(rq);
6052 4185
6053 oldprio = p->prio; 4186 oldprio = p->prio;
4187 prev_class = p->sched_class;
6054 on_rq = p->se.on_rq; 4188 on_rq = p->se.on_rq;
6055 running = task_current(rq, p); 4189 running = task_current(rq, p);
6056 if (on_rq) 4190 if (on_rq)
@@ -6068,7 +4202,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6068 if (running) 4202 if (running)
6069 p->sched_class->set_curr_task(rq); 4203 p->sched_class->set_curr_task(rq);
6070 if (on_rq) { 4204 if (on_rq) {
6071 enqueue_task(rq, p, 0); 4205 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
6072 4206
6073 check_class_changed(rq, p, prev_class, oldprio, running); 4207 check_class_changed(rq, p, prev_class, oldprio, running);
6074 } 4208 }
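For reference, rt_mutex_setprio() is what ultimately runs when a task is boosted or deboosted through a priority-inheritance futex; a minimal userspace sketch that creates such a mutex is shown below (build with -lpthread). Boosting only actually happens when a higher-priority task blocks on the lock; this sketch merely sets up the PI path.

    #include <pthread.h>
    #include <stdio.h>

    int main(void)
    {
            pthread_mutexattr_t attr;
            pthread_mutex_t m;

            pthread_mutexattr_init(&attr);
            /* PI futexes are the userspace path into rt_mutex_setprio() */
            pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
            pthread_mutex_init(&m, &attr);

            pthread_mutex_lock(&m);
            puts("holding a priority-inheritance mutex");
            pthread_mutex_unlock(&m);

            pthread_mutex_destroy(&m);
            pthread_mutexattr_destroy(&attr);
            return 0;
    }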
@@ -6090,7 +4224,6 @@ void set_user_nice(struct task_struct *p, long nice)
6090 * the task might be in the middle of scheduling on another CPU. 4224 * the task might be in the middle of scheduling on another CPU.
6091 */ 4225 */
6092 rq = task_rq_lock(p, &flags); 4226 rq = task_rq_lock(p, &flags);
6093 update_rq_clock(rq);
6094 /* 4227 /*
6095 * The RT priorities are set via sched_setscheduler(), but we still 4228 * The RT priorities are set via sched_setscheduler(), but we still
6096 * allow the 'normal' nice value to be set - but as expected 4229 * allow the 'normal' nice value to be set - but as expected
@@ -6135,7 +4268,7 @@ int can_nice(const struct task_struct *p, const int nice)
6135 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4268 /* convert nice value [19,-20] to rlimit style value [1,40] */
6136 int nice_rlim = 20 - nice; 4269 int nice_rlim = 20 - nice;
6137 4270
6138 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4271 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6139 capable(CAP_SYS_NICE)); 4272 capable(CAP_SYS_NICE));
6140} 4273}
6141 4274
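As a rough userspace illustration of the same check (ignoring CAP_SYS_NICE, which the kernel also accepts): nice values 19..-20 map onto RLIMIT_NICE values 1..40, and the request is allowed if it does not exceed the soft limit. The sample nice value of -5 is arbitrary.

    #define _GNU_SOURCE             /* RLIMIT_NICE on some older libcs */
    #include <stdio.h>
    #include <sys/resource.h>

    static int can_nice_demo(int nice)
    {
            struct rlimit rl;
            int nice_rlim = 20 - nice;      /* nice 19..-20 -> rlimit 1..40 */

            if (getrlimit(RLIMIT_NICE, &rl))
                    return 0;
            return (rlim_t)nice_rlim <= rl.rlim_cur;
    }

    int main(void)
    {
            printf("may request nice -5: %s\n", can_nice_demo(-5) ? "yes" : "no");
            return 0;
    }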
@@ -6270,7 +4403,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6270{ 4403{
6271 int retval, oldprio, oldpolicy = -1, on_rq, running; 4404 int retval, oldprio, oldpolicy = -1, on_rq, running;
6272 unsigned long flags; 4405 unsigned long flags;
6273 const struct sched_class *prev_class = p->sched_class; 4406 const struct sched_class *prev_class;
6274 struct rq *rq; 4407 struct rq *rq;
6275 int reset_on_fork; 4408 int reset_on_fork;
6276 4409
@@ -6312,7 +4445,7 @@ recheck:
6312 4445
6313 if (!lock_task_sighand(p, &flags)) 4446 if (!lock_task_sighand(p, &flags))
6314 return -ESRCH; 4447 return -ESRCH;
6315 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6316 unlock_task_sighand(p, &flags); 4449 unlock_task_sighand(p, &flags);
6317 4450
6318 /* can't set/change the rt policy */ 4451 /* can't set/change the rt policy */
@@ -6341,16 +4474,6 @@ recheck:
6341 } 4474 }
6342 4475
6343 if (user) { 4476 if (user) {
6344#ifdef CONFIG_RT_GROUP_SCHED
6345 /*
6346 * Do not allow realtime tasks into groups that have no runtime
6347 * assigned.
6348 */
6349 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6350 task_group(p)->rt_bandwidth.rt_runtime == 0)
6351 return -EPERM;
6352#endif
6353
6354 retval = security_task_setscheduler(p, policy, param); 4477 retval = security_task_setscheduler(p, policy, param);
6355 if (retval) 4478 if (retval)
6356 return retval; 4479 return retval;
@@ -6366,6 +4489,22 @@ recheck:
6366 * runqueue lock must be held. 4489 * runqueue lock must be held.
6367 */ 4490 */
6368 rq = __task_rq_lock(p); 4491 rq = __task_rq_lock(p);
4492
4493#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) {
4495 /*
4496 * Do not allow realtime tasks into groups that have no runtime
4497 * assigned.
4498 */
4499 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4500 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4501 __task_rq_unlock(rq);
4502 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4503 return -EPERM;
4504 }
4505 }
4506#endif
4507
6369 /* recheck policy now with rq lock held */ 4508 /* recheck policy now with rq lock held */
6370 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4509 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6371 policy = oldpolicy = -1; 4510 policy = oldpolicy = -1;
@@ -6373,7 +4512,6 @@ recheck:
6373 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4512 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6374 goto recheck; 4513 goto recheck;
6375 } 4514 }
6376 update_rq_clock(rq);
6377 on_rq = p->se.on_rq; 4515 on_rq = p->se.on_rq;
6378 running = task_current(rq, p); 4516 running = task_current(rq, p);
6379 if (on_rq) 4517 if (on_rq)
@@ -6384,6 +4522,7 @@ recheck:
6384 p->sched_reset_on_fork = reset_on_fork; 4522 p->sched_reset_on_fork = reset_on_fork;
6385 4523
6386 oldprio = p->prio; 4524 oldprio = p->prio;
4525 prev_class = p->sched_class;
6387 __setscheduler(rq, p, policy, param->sched_priority); 4526 __setscheduler(rq, p, policy, param->sched_priority);
6388 4527
6389 if (running) 4528 if (running)
@@ -6683,7 +4822,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6683 int ret; 4822 int ret;
6684 cpumask_var_t mask; 4823 cpumask_var_t mask;
6685 4824
6686 if (len < cpumask_size()) 4825 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4826 return -EINVAL;
4827 if (len & (sizeof(unsigned long)-1))
6687 return -EINVAL; 4828 return -EINVAL;
6688 4829
6689 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4830 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -6691,10 +4832,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6691 4832
6692 ret = sched_getaffinity(pid, mask); 4833 ret = sched_getaffinity(pid, mask);
6693 if (ret == 0) { 4834 if (ret == 0) {
6694 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4835 size_t retlen = min_t(size_t, len, cpumask_size());
4836
4837 if (copy_to_user(user_mask_ptr, mask, retlen))
6695 ret = -EFAULT; 4838 ret = -EFAULT;
6696 else 4839 else
6697 ret = cpumask_size(); 4840 ret = retlen;
6698 } 4841 }
6699 free_cpumask_var(mask); 4842 free_cpumask_var(mask);
6700 4843
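The user-visible effect of the stricter length checks can be seen by calling the raw syscall directly; the glibc sched_getaffinity() wrapper returns 0 on success and hides the byte count, so the sketch below uses syscall(). The 128-byte buffer is an arbitrary size that satisfies both new constraints on typical systems.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            /* Must be a multiple of sizeof(unsigned long) and large enough to
             * hold nr_cpu_ids bits, or the kernel now returns -EINVAL. */
            unsigned long mask[16];         /* 128 bytes on 64-bit */
            long ret = syscall(SYS_sched_getaffinity, 0, sizeof(mask), mask);

            if (ret < 0) {
                    perror("sched_getaffinity");
                    return 1;
            }
            /* The return value is now min(len, cpumask_size()): the number of
             * bytes of affinity mask actually copied back. */
            printf("kernel copied %ld bytes of affinity mask\n", ret);
            return 0;
    }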
@@ -7105,17 +5248,15 @@ static inline void sched_init_granularity(void)
7105/* 5248/*
7106 * This is how migration works: 5249 * This is how migration works:
7107 * 5250 *
7108 * 1) we queue a struct migration_req structure in the source CPU's 5251 * 1) we invoke migration_cpu_stop() on the target CPU using
7109 * runqueue and wake up that CPU's migration thread. 5252 * stop_one_cpu().
7110 * 2) we down() the locked semaphore => thread blocks. 5253 * 2) stopper starts to run (implicitly forcing the migrated thread
7111 * 3) migration thread wakes up (implicitly it forces the migrated 5254 * off the CPU)
7112 * thread off the CPU) 5255 * 3) it checks whether the migrated task is still in the wrong runqueue.
7113 * 4) it gets the migration request and checks whether the migrated 5256 * 4) if it's in the wrong runqueue then the migration thread removes
7114 * task is still in the wrong runqueue.
7115 * 5) if it's in the wrong runqueue then the migration thread removes
7116 * it and puts it into the right queue. 5257 * it and puts it into the right queue.
7117 * 6) migration thread up()s the semaphore. 5258 * 5) stopper completes and stop_one_cpu() returns and the migration
7118 * 7) we wake up and the migration is done. 5259 * is done.
7119 */ 5260 */
7120 5261
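From userspace, the protocol described in the comment above is what runs behind an affinity change; a small sketch follows (pinning to CPU 0 is an arbitrary choice). If the caller is currently on another CPU, the stopper thread there runs migration_cpu_stop() via stop_one_cpu() and pushes it over before sched_setaffinity() returns.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);               /* ask to run only on CPU 0 */

            if (sched_setaffinity(0, sizeof(set), &set)) {
                    perror("sched_setaffinity");
                    return 1;
            }
            printf("now running on CPU %d\n", sched_getcpu());
            return 0;
    }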
7121/* 5262/*
@@ -7129,24 +5270,20 @@ static inline void sched_init_granularity(void)
7129 */ 5270 */
7130int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5271int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7131{ 5272{
7132 struct migration_req req;
7133 unsigned long flags; 5273 unsigned long flags;
7134 struct rq *rq; 5274 struct rq *rq;
5275 unsigned int dest_cpu;
7135 int ret = 0; 5276 int ret = 0;
7136 5277
7137 /* 5278 /*
7138 * Since we rely on wake-ups to migrate sleeping tasks, don't change 5279 * Serialize against TASK_WAKING so that ttwu() and wunt() can
7139 * the ->cpus_allowed mask from under waking tasks, which would be 5280 * drop the rq->lock and still rely on ->cpus_allowed.
7140 * possible when we change rq->lock in ttwu(), so synchronize against
7141 * TASK_WAKING to avoid that.
7142 */ 5281 */
7143again: 5282again:
7144 while (p->state == TASK_WAKING) 5283 while (task_is_waking(p))
7145 cpu_relax(); 5284 cpu_relax();
7146
7147 rq = task_rq_lock(p, &flags); 5285 rq = task_rq_lock(p, &flags);
7148 5286 if (task_is_waking(p)) {
7149 if (p->state == TASK_WAKING) {
7150 task_rq_unlock(rq, &flags); 5287 task_rq_unlock(rq, &flags);
7151 goto again; 5288 goto again;
7152 } 5289 }
@@ -7173,15 +5310,12 @@ again:
7173 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5310 if (cpumask_test_cpu(task_cpu(p), new_mask))
7174 goto out; 5311 goto out;
7175 5312
7176 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5313 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5314 if (migrate_task(p, dest_cpu)) {
5315 struct migration_arg arg = { p, dest_cpu };
7177 /* Need help from migration thread: drop lock and wait. */ 5316 /* Need help from migration thread: drop lock and wait. */
7178 struct task_struct *mt = rq->migration_thread;
7179
7180 get_task_struct(mt);
7181 task_rq_unlock(rq, &flags); 5317 task_rq_unlock(rq, &flags);
7182 wake_up_process(rq->migration_thread); 5318 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
7183 put_task_struct(mt);
7184 wait_for_completion(&req.done);
7185 tlb_migrate_finish(p->mm); 5319 tlb_migrate_finish(p->mm);
7186 return 0; 5320 return 0;
7187 } 5321 }
@@ -7239,98 +5373,49 @@ fail:
7239 return ret; 5373 return ret;
7240} 5374}
7241 5375
7242#define RCU_MIGRATION_IDLE 0
7243#define RCU_MIGRATION_NEED_QS 1
7244#define RCU_MIGRATION_GOT_QS 2
7245#define RCU_MIGRATION_MUST_SYNC 3
7246
7247/* 5376/*
7248 * migration_thread - this is a highprio system thread that performs 5377 * migration_cpu_stop - this will be executed by a highprio stopper thread
7249 * thread migration by bumping thread off CPU then 'pushing' onto 5378 * and performs thread migration by bumping thread off CPU then
7250 * another runqueue. 5379 * 'pushing' onto another runqueue.
7251 */ 5380 */
7252static int migration_thread(void *data) 5381static int migration_cpu_stop(void *data)
7253{ 5382{
7254 int badcpu; 5383 struct migration_arg *arg = data;
7255 int cpu = (long)data;
7256 struct rq *rq;
7257
7258 rq = cpu_rq(cpu);
7259 BUG_ON(rq->migration_thread != current);
7260
7261 set_current_state(TASK_INTERRUPTIBLE);
7262 while (!kthread_should_stop()) {
7263 struct migration_req *req;
7264 struct list_head *head;
7265
7266 raw_spin_lock_irq(&rq->lock);
7267
7268 if (cpu_is_offline(cpu)) {
7269 raw_spin_unlock_irq(&rq->lock);
7270 break;
7271 }
7272
7273 if (rq->active_balance) {
7274 active_load_balance(rq, cpu);
7275 rq->active_balance = 0;
7276 }
7277
7278 head = &rq->migration_queue;
7279
7280 if (list_empty(head)) {
7281 raw_spin_unlock_irq(&rq->lock);
7282 schedule();
7283 set_current_state(TASK_INTERRUPTIBLE);
7284 continue;
7285 }
7286 req = list_entry(head->next, struct migration_req, list);
7287 list_del_init(head->next);
7288
7289 if (req->task != NULL) {
7290 raw_spin_unlock(&rq->lock);
7291 __migrate_task(req->task, cpu, req->dest_cpu);
7292 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7293 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7294 raw_spin_unlock(&rq->lock);
7295 } else {
7296 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7297 raw_spin_unlock(&rq->lock);
7298 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7299 }
7300 local_irq_enable();
7301
7302 complete(&req->done);
7303 }
7304 __set_current_state(TASK_RUNNING);
7305
7306 return 0;
7307}
7308
7309#ifdef CONFIG_HOTPLUG_CPU
7310
7311static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7312{
7313 int ret;
7314 5384
5385 /*
5386 * The original target cpu might have gone down and we might
5387 * be on another cpu but it doesn't matter.
5388 */
7315 local_irq_disable(); 5389 local_irq_disable();
7316 ret = __migrate_task(p, src_cpu, dest_cpu); 5390 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
7317 local_irq_enable(); 5391 local_irq_enable();
7318 return ret; 5392 return 0;
7319} 5393}
7320 5394
5395#ifdef CONFIG_HOTPLUG_CPU
7321/* 5396/*
7322 * Figure out where task on dead CPU should go, use force if necessary. 5397 * Figure out where task on dead CPU should go, use force if necessary.
7323 */ 5398 */
7324static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5399void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7325{ 5400{
7326 int dest_cpu; 5401 struct rq *rq = cpu_rq(dead_cpu);
5402 int needs_cpu, uninitialized_var(dest_cpu);
5403 unsigned long flags;
7327 5404
7328again: 5405 local_irq_save(flags);
7329 dest_cpu = select_fallback_rq(dead_cpu, p);
7330 5406
7331 /* It can have affinity changed while we were choosing. */ 5407 raw_spin_lock(&rq->lock);
7332 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5408 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
7333 goto again; 5409 if (needs_cpu)
5410 dest_cpu = select_fallback_rq(dead_cpu, p);
5411 raw_spin_unlock(&rq->lock);
5412 /*
5413 * It can only fail if we race with set_cpus_allowed(),
 5414	 * in which case the racer should migrate the task anyway.
5415 */
5416 if (needs_cpu)
5417 __migrate_task(p, dead_cpu, dest_cpu);
5418 local_irq_restore(flags);
7334} 5419}
7335 5420
7336/* 5421/*
@@ -7394,7 +5479,6 @@ void sched_idle_next(void)
7394 5479
7395 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5480 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7396 5481
7397 update_rq_clock(rq);
7398 activate_task(rq, p, 0); 5482 activate_task(rq, p, 0);
7399 5483
7400 raw_spin_unlock_irqrestore(&rq->lock, flags); 5484 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -7449,7 +5533,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7449 for ( ; ; ) { 5533 for ( ; ; ) {
7450 if (!rq->nr_running) 5534 if (!rq->nr_running)
7451 break; 5535 break;
7452 update_rq_clock(rq);
7453 next = pick_next_task(rq); 5536 next = pick_next_task(rq);
7454 if (!next) 5537 if (!next)
7455 break; 5538 break;
@@ -7672,35 +5755,20 @@ static void set_rq_offline(struct rq *rq)
7672static int __cpuinit 5755static int __cpuinit
7673migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5756migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7674{ 5757{
7675 struct task_struct *p;
7676 int cpu = (long)hcpu; 5758 int cpu = (long)hcpu;
7677 unsigned long flags; 5759 unsigned long flags;
7678 struct rq *rq; 5760 struct rq *rq = cpu_rq(cpu);
7679 5761
7680 switch (action) { 5762 switch (action) {
7681 5763
7682 case CPU_UP_PREPARE: 5764 case CPU_UP_PREPARE:
7683 case CPU_UP_PREPARE_FROZEN: 5765 case CPU_UP_PREPARE_FROZEN:
7684 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
7685 if (IS_ERR(p))
7686 return NOTIFY_BAD;
7687 kthread_bind(p, cpu);
7688 /* Must be high prio: stop_machine expects to yield to it. */
7689 rq = task_rq_lock(p, &flags);
7690 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7691 task_rq_unlock(rq, &flags);
7692 get_task_struct(p);
7693 cpu_rq(cpu)->migration_thread = p;
7694 rq->calc_load_update = calc_load_update; 5766 rq->calc_load_update = calc_load_update;
7695 break; 5767 break;
7696 5768
7697 case CPU_ONLINE: 5769 case CPU_ONLINE:
7698 case CPU_ONLINE_FROZEN: 5770 case CPU_ONLINE_FROZEN:
7699 /* Strictly unnecessary, as first user will wake it. */
7700 wake_up_process(cpu_rq(cpu)->migration_thread);
7701
7702 /* Update our root-domain */ 5771 /* Update our root-domain */
7703 rq = cpu_rq(cpu);
7704 raw_spin_lock_irqsave(&rq->lock, flags); 5772 raw_spin_lock_irqsave(&rq->lock, flags);
7705 if (rq->rd) { 5773 if (rq->rd) {
7706 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5774 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7711,61 +5779,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7711 break; 5779 break;
7712 5780
7713#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
7714 case CPU_UP_CANCELED:
7715 case CPU_UP_CANCELED_FROZEN:
7716 if (!cpu_rq(cpu)->migration_thread)
7717 break;
7718 /* Unbind it from offline cpu so it can run. Fall thru. */
7719 kthread_bind(cpu_rq(cpu)->migration_thread,
7720 cpumask_any(cpu_online_mask));
7721 kthread_stop(cpu_rq(cpu)->migration_thread);
7722 put_task_struct(cpu_rq(cpu)->migration_thread);
7723 cpu_rq(cpu)->migration_thread = NULL;
7724 break;
7725
7726 case CPU_DEAD: 5782 case CPU_DEAD:
7727 case CPU_DEAD_FROZEN: 5783 case CPU_DEAD_FROZEN:
7728 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7729 migrate_live_tasks(cpu); 5784 migrate_live_tasks(cpu);
7730 rq = cpu_rq(cpu);
7731 kthread_stop(rq->migration_thread);
7732 put_task_struct(rq->migration_thread);
7733 rq->migration_thread = NULL;
7734 /* Idle task back to normal (off runqueue, low prio) */ 5785 /* Idle task back to normal (off runqueue, low prio) */
7735 raw_spin_lock_irq(&rq->lock); 5786 raw_spin_lock_irq(&rq->lock);
7736 update_rq_clock(rq);
7737 deactivate_task(rq, rq->idle, 0); 5787 deactivate_task(rq, rq->idle, 0);
7738 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5788 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7739 rq->idle->sched_class = &idle_sched_class; 5789 rq->idle->sched_class = &idle_sched_class;
7740 migrate_dead_tasks(cpu); 5790 migrate_dead_tasks(cpu);
7741 raw_spin_unlock_irq(&rq->lock); 5791 raw_spin_unlock_irq(&rq->lock);
7742 cpuset_unlock();
7743 migrate_nr_uninterruptible(rq); 5792 migrate_nr_uninterruptible(rq);
7744 BUG_ON(rq->nr_running != 0); 5793 BUG_ON(rq->nr_running != 0);
7745 calc_global_load_remove(rq); 5794 calc_global_load_remove(rq);
7746 /*
7747 * No need to migrate the tasks: it was best-effort if
7748 * they didn't take sched_hotcpu_mutex. Just wake up
7749 * the requestors.
7750 */
7751 raw_spin_lock_irq(&rq->lock);
7752 while (!list_empty(&rq->migration_queue)) {
7753 struct migration_req *req;
7754
7755 req = list_entry(rq->migration_queue.next,
7756 struct migration_req, list);
7757 list_del_init(&req->list);
7758 raw_spin_unlock_irq(&rq->lock);
7759 complete(&req->done);
7760 raw_spin_lock_irq(&rq->lock);
7761 }
7762 raw_spin_unlock_irq(&rq->lock);
7763 break; 5795 break;
7764 5796
7765 case CPU_DYING: 5797 case CPU_DYING:
7766 case CPU_DYING_FROZEN: 5798 case CPU_DYING_FROZEN:
7767 /* Update our root-domain */ 5799 /* Update our root-domain */
7768 rq = cpu_rq(cpu);
7769 raw_spin_lock_irqsave(&rq->lock, flags); 5800 raw_spin_lock_irqsave(&rq->lock, flags);
7770 if (rq->rd) { 5801 if (rq->rd) {
7771 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5802 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -8096,6 +6127,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
8096 struct rq *rq = cpu_rq(cpu); 6127 struct rq *rq = cpu_rq(cpu);
8097 struct sched_domain *tmp; 6128 struct sched_domain *tmp;
8098 6129
6130 for (tmp = sd; tmp; tmp = tmp->parent)
6131 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6132
8099 /* Remove the sched domains which do not contribute to scheduling. */ 6133 /* Remove the sched domains which do not contribute to scheduling. */
8100 for (tmp = sd; tmp; ) { 6134 for (tmp = sd; tmp; ) {
8101 struct sched_domain *parent = tmp->parent; 6135 struct sched_domain *parent = tmp->parent;
@@ -9202,11 +7236,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9202 7236
9203#ifdef CONFIG_SCHED_MC 7237#ifdef CONFIG_SCHED_MC
9204static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7238static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7239 struct sysdev_class_attribute *attr,
9205 char *page) 7240 char *page)
9206{ 7241{
9207 return sprintf(page, "%u\n", sched_mc_power_savings); 7242 return sprintf(page, "%u\n", sched_mc_power_savings);
9208} 7243}
9209static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7244static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7245 struct sysdev_class_attribute *attr,
9210 const char *buf, size_t count) 7246 const char *buf, size_t count)
9211{ 7247{
9212 return sched_power_savings_store(buf, count, 0); 7248 return sched_power_savings_store(buf, count, 0);
@@ -9218,11 +7254,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9218 7254
9219#ifdef CONFIG_SCHED_SMT 7255#ifdef CONFIG_SCHED_SMT
9220static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7256static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7257 struct sysdev_class_attribute *attr,
9221 char *page) 7258 char *page)
9222{ 7259{
9223 return sprintf(page, "%u\n", sched_smt_power_savings); 7260 return sprintf(page, "%u\n", sched_smt_power_savings);
9224} 7261}
9225static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7262static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7263 struct sysdev_class_attribute *attr,
9226 const char *buf, size_t count) 7264 const char *buf, size_t count)
9227{ 7265{
9228 return sched_power_savings_store(buf, count, 1); 7266 return sched_power_savings_store(buf, count, 1);
@@ -9437,7 +7475,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9437 tg->rt_rq[cpu] = rt_rq; 7475 tg->rt_rq[cpu] = rt_rq;
9438 init_rt_rq(rt_rq, rq); 7476 init_rt_rq(rt_rq, rq);
9439 rt_rq->tg = tg; 7477 rt_rq->tg = tg;
9440 rt_rq->rt_se = rt_se;
9441 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7478 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9442 if (add) 7479 if (add)
9443 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7480 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9468,9 +7505,6 @@ void __init sched_init(void)
9468#ifdef CONFIG_RT_GROUP_SCHED 7505#ifdef CONFIG_RT_GROUP_SCHED
9469 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7506 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9470#endif 7507#endif
9471#ifdef CONFIG_USER_SCHED
9472 alloc_size *= 2;
9473#endif
9474#ifdef CONFIG_CPUMASK_OFFSTACK 7508#ifdef CONFIG_CPUMASK_OFFSTACK
9475 alloc_size += num_possible_cpus() * cpumask_size(); 7509 alloc_size += num_possible_cpus() * cpumask_size();
9476#endif 7510#endif
@@ -9484,13 +7518,6 @@ void __init sched_init(void)
9484 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7518 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9485 ptr += nr_cpu_ids * sizeof(void **); 7519 ptr += nr_cpu_ids * sizeof(void **);
9486 7520
9487#ifdef CONFIG_USER_SCHED
9488 root_task_group.se = (struct sched_entity **)ptr;
9489 ptr += nr_cpu_ids * sizeof(void **);
9490
9491 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9492 ptr += nr_cpu_ids * sizeof(void **);
9493#endif /* CONFIG_USER_SCHED */
9494#endif /* CONFIG_FAIR_GROUP_SCHED */ 7521#endif /* CONFIG_FAIR_GROUP_SCHED */
9495#ifdef CONFIG_RT_GROUP_SCHED 7522#ifdef CONFIG_RT_GROUP_SCHED
9496 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7523 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9499,13 +7526,6 @@ void __init sched_init(void)
9499 init_task_group.rt_rq = (struct rt_rq **)ptr; 7526 init_task_group.rt_rq = (struct rt_rq **)ptr;
9500 ptr += nr_cpu_ids * sizeof(void **); 7527 ptr += nr_cpu_ids * sizeof(void **);
9501 7528
9502#ifdef CONFIG_USER_SCHED
9503 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9504 ptr += nr_cpu_ids * sizeof(void **);
9505
9506 root_task_group.rt_rq = (struct rt_rq **)ptr;
9507 ptr += nr_cpu_ids * sizeof(void **);
9508#endif /* CONFIG_USER_SCHED */
9509#endif /* CONFIG_RT_GROUP_SCHED */ 7529#endif /* CONFIG_RT_GROUP_SCHED */
9510#ifdef CONFIG_CPUMASK_OFFSTACK 7530#ifdef CONFIG_CPUMASK_OFFSTACK
9511 for_each_possible_cpu(i) { 7531 for_each_possible_cpu(i) {
@@ -9525,22 +7545,13 @@ void __init sched_init(void)
9525#ifdef CONFIG_RT_GROUP_SCHED 7545#ifdef CONFIG_RT_GROUP_SCHED
9526 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7546 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9527 global_rt_period(), global_rt_runtime()); 7547 global_rt_period(), global_rt_runtime());
9528#ifdef CONFIG_USER_SCHED
9529 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9530 global_rt_period(), RUNTIME_INF);
9531#endif /* CONFIG_USER_SCHED */
9532#endif /* CONFIG_RT_GROUP_SCHED */ 7548#endif /* CONFIG_RT_GROUP_SCHED */
9533 7549
9534#ifdef CONFIG_GROUP_SCHED 7550#ifdef CONFIG_CGROUP_SCHED
9535 list_add(&init_task_group.list, &task_groups); 7551 list_add(&init_task_group.list, &task_groups);
9536 INIT_LIST_HEAD(&init_task_group.children); 7552 INIT_LIST_HEAD(&init_task_group.children);
9537 7553
9538#ifdef CONFIG_USER_SCHED 7554#endif /* CONFIG_CGROUP_SCHED */
9539 INIT_LIST_HEAD(&root_task_group.children);
9540 init_task_group.parent = &root_task_group;
9541 list_add(&init_task_group.siblings, &root_task_group.children);
9542#endif /* CONFIG_USER_SCHED */
9543#endif /* CONFIG_GROUP_SCHED */
9544 7555
9545#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7556#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9546 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7557 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9580,25 +7591,6 @@ void __init sched_init(void)
9580 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7591 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9581 */ 7592 */
9582 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7593 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9583#elif defined CONFIG_USER_SCHED
9584 root_task_group.shares = NICE_0_LOAD;
9585 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9586 /*
9587 * In case of task-groups formed thr' the user id of tasks,
9588 * init_task_group represents tasks belonging to root user.
9589 * Hence it forms a sibling of all subsequent groups formed.
9590 * In this case, init_task_group gets only a fraction of overall
9591 * system cpu resource, based on the weight assigned to root
9592 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9593 * by letting tasks of init_task_group sit in a separate cfs_rq
9594 * (init_tg_cfs_rq) and having one entity represent this group of
9595 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9596 */
9597 init_tg_cfs_entry(&init_task_group,
9598 &per_cpu(init_tg_cfs_rq, i),
9599 &per_cpu(init_sched_entity, i), i, 1,
9600 root_task_group.se[i]);
9601
9602#endif 7594#endif
9603#endif /* CONFIG_FAIR_GROUP_SCHED */ 7595#endif /* CONFIG_FAIR_GROUP_SCHED */
9604 7596
@@ -9607,12 +7599,6 @@ void __init sched_init(void)
9607 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7599 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9608#ifdef CONFIG_CGROUP_SCHED 7600#ifdef CONFIG_CGROUP_SCHED
9609 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7601 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9610#elif defined CONFIG_USER_SCHED
9611 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9612 init_tg_rt_entry(&init_task_group,
9613 &per_cpu(init_rt_rq_var, i),
9614 &per_cpu(init_sched_rt_entity, i), i, 1,
9615 root_task_group.rt_se[i]);
9616#endif 7602#endif
9617#endif 7603#endif
9618 7604
@@ -9621,16 +7607,15 @@ void __init sched_init(void)
9621#ifdef CONFIG_SMP 7607#ifdef CONFIG_SMP
9622 rq->sd = NULL; 7608 rq->sd = NULL;
9623 rq->rd = NULL; 7609 rq->rd = NULL;
7610 rq->cpu_power = SCHED_LOAD_SCALE;
9624 rq->post_schedule = 0; 7611 rq->post_schedule = 0;
9625 rq->active_balance = 0; 7612 rq->active_balance = 0;
9626 rq->next_balance = jiffies; 7613 rq->next_balance = jiffies;
9627 rq->push_cpu = 0; 7614 rq->push_cpu = 0;
9628 rq->cpu = i; 7615 rq->cpu = i;
9629 rq->online = 0; 7616 rq->online = 0;
9630 rq->migration_thread = NULL;
9631 rq->idle_stamp = 0; 7617 rq->idle_stamp = 0;
9632 rq->avg_idle = 2*sysctl_sched_migration_cost; 7618 rq->avg_idle = 2*sysctl_sched_migration_cost;
9633 INIT_LIST_HEAD(&rq->migration_queue);
9634 rq_attach_root(rq, &def_root_domain); 7619 rq_attach_root(rq, &def_root_domain);
9635#endif 7620#endif
9636 init_rq_hrtick(rq); 7621 init_rq_hrtick(rq);
@@ -9697,7 +7682,7 @@ static inline int preempt_count_equals(int preempt_offset)
9697 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7682 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9698} 7683}
9699 7684
9700void __might_sleep(char *file, int line, int preempt_offset) 7685void __might_sleep(const char *file, int line, int preempt_offset)
9701{ 7686{
9702#ifdef in_atomic 7687#ifdef in_atomic
9703 static unsigned long prev_jiffy; /* ratelimiting */ 7688 static unsigned long prev_jiffy; /* ratelimiting */
@@ -9731,7 +7716,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
9731{ 7716{
9732 int on_rq; 7717 int on_rq;
9733 7718
9734 update_rq_clock(rq);
9735 on_rq = p->se.on_rq; 7719 on_rq = p->se.on_rq;
9736 if (on_rq) 7720 if (on_rq)
9737 deactivate_task(rq, p, 0); 7721 deactivate_task(rq, p, 0);
@@ -9758,9 +7742,9 @@ void normalize_rt_tasks(void)
9758 7742
9759 p->se.exec_start = 0; 7743 p->se.exec_start = 0;
9760#ifdef CONFIG_SCHEDSTATS 7744#ifdef CONFIG_SCHEDSTATS
9761 p->se.wait_start = 0; 7745 p->se.statistics.wait_start = 0;
9762 p->se.sleep_start = 0; 7746 p->se.statistics.sleep_start = 0;
9763 p->se.block_start = 0; 7747 p->se.statistics.block_start = 0;
9764#endif 7748#endif
9765 7749
9766 if (!rt_task(p)) { 7750 if (!rt_task(p)) {
@@ -9787,9 +7771,9 @@ void normalize_rt_tasks(void)
9787 7771
9788#endif /* CONFIG_MAGIC_SYSRQ */ 7772#endif /* CONFIG_MAGIC_SYSRQ */
9789 7773
9790#ifdef CONFIG_IA64 7774#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
9791/* 7775/*
9792 * These functions are only useful for the IA64 MCA handling. 7776 * These functions are only useful for the IA64 MCA handling, or kdb.
9793 * 7777 *
9794 * They can only be called when the whole system has been 7778 * They can only be called when the whole system has been
9795 * stopped - every CPU needs to be quiescent, and no scheduling 7779 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -9809,6 +7793,9 @@ struct task_struct *curr_task(int cpu)
9809 return cpu_curr(cpu); 7793 return cpu_curr(cpu);
9810} 7794}
9811 7795
7796#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7797
7798#ifdef CONFIG_IA64
9812/** 7799/**
9813 * set_curr_task - set the current task for a given cpu. 7800 * set_curr_task - set the current task for a given cpu.
9814 * @cpu: the processor in question. 7801 * @cpu: the processor in question.
@@ -10008,7 +7995,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10008} 7995}
10009#endif /* CONFIG_RT_GROUP_SCHED */ 7996#endif /* CONFIG_RT_GROUP_SCHED */
10010 7997
10011#ifdef CONFIG_GROUP_SCHED 7998#ifdef CONFIG_CGROUP_SCHED
10012static void free_sched_group(struct task_group *tg) 7999static void free_sched_group(struct task_group *tg)
10013{ 8000{
10014 free_fair_sched_group(tg); 8001 free_fair_sched_group(tg);
@@ -10093,8 +8080,6 @@ void sched_move_task(struct task_struct *tsk)
10093 8080
10094 rq = task_rq_lock(tsk, &flags); 8081 rq = task_rq_lock(tsk, &flags);
10095 8082
10096 update_rq_clock(rq);
10097
10098 running = task_current(rq, tsk); 8083 running = task_current(rq, tsk);
10099 on_rq = tsk->se.on_rq; 8084 on_rq = tsk->se.on_rq;
10100 8085
@@ -10117,7 +8102,7 @@ void sched_move_task(struct task_struct *tsk)
10117 8102
10118 task_rq_unlock(rq, &flags); 8103 task_rq_unlock(rq, &flags);
10119} 8104}
10120#endif /* CONFIG_GROUP_SCHED */ 8105#endif /* CONFIG_CGROUP_SCHED */
10121 8106
10122#ifdef CONFIG_FAIR_GROUP_SCHED 8107#ifdef CONFIG_FAIR_GROUP_SCHED
10123static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8108static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10259,13 +8244,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10259 runtime = d->rt_runtime; 8244 runtime = d->rt_runtime;
10260 } 8245 }
10261 8246
10262#ifdef CONFIG_USER_SCHED
10263 if (tg == &root_task_group) {
10264 period = global_rt_period();
10265 runtime = global_rt_runtime();
10266 }
10267#endif
10268
10269 /* 8247 /*
10270 * Cannot have more runtime than the period. 8248 * Cannot have more runtime than the period.
10271 */ 8249 */
@@ -10668,7 +8646,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10668struct cpuacct { 8646struct cpuacct {
10669 struct cgroup_subsys_state css; 8647 struct cgroup_subsys_state css;
10670 /* cpuusage holds pointer to a u64-type object on every cpu */ 8648 /* cpuusage holds pointer to a u64-type object on every cpu */
10671 u64 *cpuusage; 8649 u64 __percpu *cpuusage;
10672 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8650 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10673 struct cpuacct *parent; 8651 struct cpuacct *parent;
10674}; 8652};
@@ -10885,12 +8863,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10885} 8863}
10886 8864
10887/* 8865/*
8866 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
8867 * in cputime_t units. As a result, cpuacct_update_stats calls
8868 * percpu_counter_add with values large enough to always overflow the
8869 * per cpu batch limit causing bad SMP scalability.
8870 *
8871 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
8872 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
8873 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
8874 */
8875#ifdef CONFIG_SMP
8876#define CPUACCT_BATCH \
8877 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
8878#else
8879#define CPUACCT_BATCH 0
8880#endif
8881
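The capping arithmetic itself is a simple min(); a userspace sketch with made-up values for percpu_counter_batch and cputime_one_jiffy:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical values: cputime_one_jiffy is very large when
             * CONFIG_VIRT_CPU_ACCOUNTING uses fine-grained cputime units. */
            long percpu_counter_batch = 32;
            long cputime_one_jiffy = 10000000;
            long scaled = percpu_counter_batch * cputime_one_jiffy;
            int batch = scaled < INT_MAX ? (int)scaled : INT_MAX;

            printf("effective per-cpu batch: %d\n", batch);
            return 0;
    }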
8882/*
10888 * Charge the system/user time to the task's accounting group. 8883 * Charge the system/user time to the task's accounting group.
10889 */ 8884 */
10890static void cpuacct_update_stats(struct task_struct *tsk, 8885static void cpuacct_update_stats(struct task_struct *tsk,
10891 enum cpuacct_stat_index idx, cputime_t val) 8886 enum cpuacct_stat_index idx, cputime_t val)
10892{ 8887{
10893 struct cpuacct *ca; 8888 struct cpuacct *ca;
8889 int batch = CPUACCT_BATCH;
10894 8890
10895 if (unlikely(!cpuacct_subsys.active)) 8891 if (unlikely(!cpuacct_subsys.active))
10896 return; 8892 return;
@@ -10899,7 +8895,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10899 ca = task_ca(tsk); 8895 ca = task_ca(tsk);
10900 8896
10901 do { 8897 do {
10902 percpu_counter_add(&ca->cpustat[idx], val); 8898 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10903 ca = ca->parent; 8899 ca = ca->parent;
10904 } while (ca); 8900 } while (ca);
10905 rcu_read_unlock(); 8901 rcu_read_unlock();
@@ -10916,43 +8912,32 @@ struct cgroup_subsys cpuacct_subsys = {
10916 8912
10917#ifndef CONFIG_SMP 8913#ifndef CONFIG_SMP
10918 8914
10919int rcu_expedited_torture_stats(char *page)
10920{
10921 return 0;
10922}
10923EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10924
10925void synchronize_sched_expedited(void) 8915void synchronize_sched_expedited(void)
10926{ 8916{
8917 barrier();
10927} 8918}
10928EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8919EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10929 8920
10930#else /* #ifndef CONFIG_SMP */ 8921#else /* #ifndef CONFIG_SMP */
10931 8922
10932static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8923static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
10933static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10934
10935#define RCU_EXPEDITED_STATE_POST -2
10936#define RCU_EXPEDITED_STATE_IDLE -1
10937
10938static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10939 8924
10940int rcu_expedited_torture_stats(char *page) 8925static int synchronize_sched_expedited_cpu_stop(void *data)
10941{ 8926{
10942 int cnt = 0; 8927 /*
10943 int cpu; 8928 * There must be a full memory barrier on each affected CPU
10944 8929 * between the time that try_stop_cpus() is called and the
10945 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8930 * time that it returns.
10946 for_each_online_cpu(cpu) { 8931 *
10947 cnt += sprintf(&page[cnt], " %d:%d", 8932 * In the current initial implementation of cpu_stop, the
10948 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8933 * above condition is already met when the control reaches
10949 } 8934 * this point and the following smp_mb() is not strictly
10950 cnt += sprintf(&page[cnt], "\n"); 8935 * necessary. Do smp_mb() anyway for documentation and
10951 return cnt; 8936 * robustness against future implementation changes.
8937 */
8938 smp_mb(); /* See above comment block. */
8939 return 0;
10952} 8940}
10953EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10954
10955static long synchronize_sched_expedited_count;
10956 8941
10957/* 8942/*
10958 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8943 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -10966,18 +8951,14 @@ static long synchronize_sched_expedited_count;
10966 */ 8951 */
10967void synchronize_sched_expedited(void) 8952void synchronize_sched_expedited(void)
10968{ 8953{
10969 int cpu; 8954 int snap, trycount = 0;
10970 unsigned long flags;
10971 bool need_full_sync = 0;
10972 struct rq *rq;
10973 struct migration_req *req;
10974 long snap;
10975 int trycount = 0;
10976 8955
10977 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8956 smp_mb(); /* ensure prior mod happens before capturing snap. */
10978 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8957 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
10979 get_online_cpus(); 8958 get_online_cpus();
10980 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8959 while (try_stop_cpus(cpu_online_mask,
8960 synchronize_sched_expedited_cpu_stop,
8961 NULL) == -EAGAIN) {
10981 put_online_cpus(); 8962 put_online_cpus();
10982 if (trycount++ < 10) 8963 if (trycount++ < 10)
10983 udelay(trycount * num_online_cpus()); 8964 udelay(trycount * num_online_cpus());
@@ -10985,41 +8966,15 @@ void synchronize_sched_expedited(void)
10985 synchronize_sched(); 8966 synchronize_sched();
10986 return; 8967 return;
10987 } 8968 }
10988 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8969 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
10989 smp_mb(); /* ensure test happens before caller kfree */ 8970 smp_mb(); /* ensure test happens before caller kfree */
10990 return; 8971 return;
10991 } 8972 }
10992 get_online_cpus(); 8973 get_online_cpus();
10993 } 8974 }
10994 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8975 atomic_inc(&synchronize_sched_expedited_count);
10995 for_each_online_cpu(cpu) { 8976 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
10996 rq = cpu_rq(cpu);
10997 req = &per_cpu(rcu_migration_req, cpu);
10998 init_completion(&req->done);
10999 req->task = NULL;
11000 req->dest_cpu = RCU_MIGRATION_NEED_QS;
11001 raw_spin_lock_irqsave(&rq->lock, flags);
11002 list_add(&req->list, &rq->migration_queue);
11003 raw_spin_unlock_irqrestore(&rq->lock, flags);
11004 wake_up_process(rq->migration_thread);
11005 }
11006 for_each_online_cpu(cpu) {
11007 rcu_expedited_state = cpu;
11008 req = &per_cpu(rcu_migration_req, cpu);
11009 rq = cpu_rq(cpu);
11010 wait_for_completion(&req->done);
11011 raw_spin_lock_irqsave(&rq->lock, flags);
11012 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
11013 need_full_sync = 1;
11014 req->dest_cpu = RCU_MIGRATION_IDLE;
11015 raw_spin_unlock_irqrestore(&rq->lock, flags);
11016 }
11017 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
11018 synchronize_sched_expedited_count++;
11019 mutex_unlock(&rcu_sched_expedited_mutex);
11020 put_online_cpus(); 8977 put_online_cpus();
11021 if (need_full_sync)
11022 synchronize_sched();
11023} 8978}
11024EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8979EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
11025 8980
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 42 * (NSEC_PER_SEC / HZ);
43} 43}
44EXPORT_SYMBOL_GPL(sched_clock);
44 45
45static __read_mostly int sched_clock_running; 46static __read_mostly int sched_clock_running;
46 47
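The new export matters for GPL modules, which can now call sched_clock() directly; a minimal module-style sketch (the "sc_demo" names are invented):

    #include <linux/module.h>
    #include <linux/sched.h>

    static int __init sc_demo_init(void)
    {
            unsigned long long now = sched_clock();

            pr_info("sched_clock() = %llu ns\n", now);
            return 0;
    }

    static void __exit sc_demo_exit(void)
    {
    }

    module_init(sc_demo_init);
    module_exit(sc_demo_exit);
    MODULE_LICENSE("GPL");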
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -47,9 +48,7 @@ static int convert_prio(int prio)
47} 48}
48 49
49#define for_each_cpupri_active(array, idx) \ 50#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 52
54/** 53/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -58,7 +57,7 @@ static int convert_prio(int prio)
58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 58 *
60 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..35565395d00d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
174 176
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 178#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 180#endif
@@ -384,15 +381,9 @@ __initcall(init_sched_debug_procfs);
384void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 381void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
385{ 382{
386 unsigned long nr_switches; 383 unsigned long nr_switches;
387 unsigned long flags;
388 int num_threads = 1;
389
390 if (lock_task_sighand(p, &flags)) {
391 num_threads = atomic_read(&p->signal->count);
392 unlock_task_sighand(p, &flags);
393 }
394 384
395 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 385 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
386 get_nr_threads(p));
396 SEQ_printf(m, 387 SEQ_printf(m,
397 "---------------------------------------------------------\n"); 388 "---------------------------------------------------------\n");
398#define __P(F) \ 389#define __P(F) \
@@ -407,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 398 PN(se.exec_start);
408 PN(se.vruntime); 399 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 400 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 401
413 nr_switches = p->nvcsw + p->nivcsw; 402 nr_switches = p->nvcsw + p->nivcsw;
414 403
415#ifdef CONFIG_SCHEDSTATS 404#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 405 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 406 PN(se.statistics.sleep_start);
418 PN(se.block_start); 407 PN(se.statistics.block_start);
419 PN(se.sleep_max); 408 PN(se.statistics.sleep_max);
420 PN(se.block_max); 409 PN(se.statistics.block_max);
421 PN(se.exec_max); 410 PN(se.statistics.exec_max);
422 PN(se.slice_max); 411 PN(se.statistics.slice_max);
423 PN(se.wait_max); 412 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 413 PN(se.statistics.wait_sum);
425 P(se.wait_count); 414 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 415 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 416 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 417 P(sched_info.bkl_count);
429 P(se.nr_migrations); 418 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 419 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 420 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 421 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 422 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 423 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 424 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 425 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 426 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 427 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 428 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 429 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 430 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 431 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 432 P(se.statistics.nr_wakeups_idle);
444 433
445 { 434 {
446 u64 avg_atom, avg_per_cpu; 435 u64 avg_atom, avg_per_cpu;
@@ -491,35 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 480void proc_sched_set_task(struct task_struct *p)
492{ 481{
493#ifdef CONFIG_SCHEDSTATS 482#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 483 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 484#endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525} 485}
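The memset path above is reachable from userspace: with CONFIG_SCHED_DEBUG enabled, writing anything to /proc/<pid>/sched clears that task's schedstats (the written string is irrelevant). A small sketch that resets and then dumps the caller's own statistics:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/sched", "w");

            if (f) {
                    fputs("0\n", f);        /* any write resets the statistics */
                    fclose(f);
            }

            f = fopen("/proc/self/sched", "r");
            if (!f) {
                    perror("/proc/self/sched");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }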
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..a878b5332daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
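These defaults are visible (and tunable) through sysctls when CONFIG_SCHED_DEBUG is enabled; a small userspace sketch that reads them back and prints the latency/granularity ratio that sched_nr_latency is kept equal to:

    #include <stdio.h>

    static long read_long(const char *path)
    {
            long val = -1;
            FILE *f = fopen(path, "r");

            if (f) {
                    if (fscanf(f, "%ld", &val) != 1)
                            val = -1;
                    fclose(f);
            }
            return val;
    }

    int main(void)
    {
            long latency = read_long("/proc/sys/kernel/sched_latency_ns");
            long gran = read_long("/proc/sys/kernel/sched_min_granularity_ns");

            if (latency <= 0 || gran <= 0) {
                    fprintf(stderr, "scheduler debug sysctls not available\n");
                    return 1;
            }
            printf("latency=%ldns granularity=%ldns nr_latency=%ld\n",
                   latency, gran, latency / gran);
            return 0;
    }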
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
 777 * through calling update_curr(). 765 * through calling update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1053,16 +1041,11 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1041 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1044static void
1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1057{ 1046{
1058 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1060 int flags = 0;
1061
1062 if (wakeup)
1063 flags |= ENQUEUE_WAKEUP;
1064 if (p->state == TASK_WAKING)
1065 flags |= ENQUEUE_MIGRATE;
1066 1049
1067 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1068 if (se->on_rq) 1051 if (se->on_rq)
@@ -1080,18 +1063,18 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
1080 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1081 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1082 */ 1065 */
1083static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1084{ 1067{
1085 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1086 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1087 1070
1088 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1089 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1090 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1091 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1092 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1093 break; 1076 break;
1094 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1095 } 1078 }
1096 1079
1097 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1239,11 +1222,9 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1239 1222
1240static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1241{ 1224{
1242 struct task_struct *curr = current;
1243 unsigned long this_load, load; 1225 unsigned long this_load, load;
1244 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1245 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
1246 unsigned int imbalance;
1247 struct task_group *tg; 1228 struct task_group *tg;
1248 unsigned long weight; 1229 unsigned long weight;
1249 int balanced; 1230 int balanced;
@@ -1254,23 +1235,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1254 load = source_load(prev_cpu, idx); 1235 load = source_load(prev_cpu, idx);
1255 this_load = target_load(this_cpu, idx); 1236 this_load = target_load(this_cpu, idx);
1256 1237
1257 if (sync) {
1258 if (sched_feat(SYNC_LESS) &&
1259 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1260 p->se.avg_overlap > sysctl_sched_migration_cost))
1261 sync = 0;
1262 } else {
1263 if (sched_feat(SYNC_MORE) &&
1264 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1265 p->se.avg_overlap < sysctl_sched_migration_cost))
1266 sync = 1;
1267 }
1268
1269 /* 1238 /*
1270 * If sync wakeup then subtract the (maximum possible) 1239 * If sync wakeup then subtract the (maximum possible)
1271 * effect of the currently running task from the load 1240 * effect of the currently running task from the load
1272 * of the current CPU: 1241 * of the current CPU:
1273 */ 1242 */
1243 rcu_read_lock();
1274 if (sync) { 1244 if (sync) {
1275 tg = task_group(current); 1245 tg = task_group(current);
1276 weight = current->se.load.weight; 1246 weight = current->se.load.weight;
@@ -1282,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1282 tg = task_group(p); 1252 tg = task_group(p);
1283 weight = p->se.load.weight; 1253 weight = p->se.load.weight;
1284 1254
1285 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1286
1287 /* 1255 /*
1288 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1256 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1289 * due to the sync cause above having dropped this_load to 0, we'll 1257 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1293,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1293 * Otherwise check if either cpus are near enough in load to allow this 1261 * Otherwise check if either cpus are near enough in load to allow this
1294 * task to be woken on this_cpu. 1262 * task to be woken on this_cpu.
1295 */ 1263 */
1296 balanced = !this_load || 1264 if (this_load) {
1297 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= 1265 unsigned long this_eff_load, prev_eff_load;
1298 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1266
1267 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu);
1269 this_eff_load *= this_load +
1270 effective_load(tg, this_cpu, weight, weight);
1271
1272 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1273 prev_eff_load *= power_of(this_cpu);
1274 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1275
1276 balanced = this_eff_load <= prev_eff_load;
1277 } else
1278 balanced = true;
1279 rcu_read_unlock();
1299 1280
1300 /* 1281 /*
1301 * If the currently running task will sleep within 1282 * If the currently running task will sleep within
@@ -1305,7 +1286,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1305 if (sync && balanced) 1286 if (sync && balanced)
1306 return 1; 1287 return 1;
1307 1288
1308 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1289 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1309 tl_per_task = cpu_avg_load_per_task(this_cpu); 1290 tl_per_task = cpu_avg_load_per_task(this_cpu);
1310 1291
1311 if (balanced || 1292 if (balanced ||
@@ -1317,7 +1298,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1317 * there is no bad imbalance. 1298 * there is no bad imbalance.
1318 */ 1299 */
1319 schedstat_inc(sd, ttwu_move_affine); 1300 schedstat_inc(sd, ttwu_move_affine);
1320 schedstat_inc(p, se.nr_wakeups_affine); 1301 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1321 1302
1322 return 1; 1303 return 1;
1323 } 1304 }
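The rewritten balance test above replaces the old single imbalance_pct comparison with two power-weighted products, cross-multiplying each side by the other CPU's power instead of dividing by its own. A standalone sketch of that arithmetic (plain C; the effective_load() group-scheduling correction is reduced to simply adding the task's weight on the pulling side, and the figures in main() are invented):

#include <stdbool.h>
#include <stdio.h>

/*
 * Illustrative sketch, not kernel code: the wake_affine() balance test.
 * Each side is scaled by the *other* CPU's power (cross-multiplication
 * avoids a division), and the previous CPU keeps the usual imbalance_pct
 * headroom.
 */
static bool wake_affine_balanced(unsigned long this_load,
                                 unsigned long prev_load,
                                 unsigned long this_power,
                                 unsigned long prev_power,
                                 unsigned int imbalance_pct,
                                 unsigned long task_weight)
{
        unsigned long this_eff_load, prev_eff_load;

        if (!this_load)                 /* this CPU carries nothing: balanced */
                return true;

        /* pulling the task adds its weight to this CPU's load */
        this_eff_load  = 100;
        this_eff_load *= prev_power;
        this_eff_load *= this_load + task_weight;

        /* the previous CPU keeps its load plus the imbalance_pct margin */
        prev_eff_load  = 100 + (imbalance_pct - 100) / 2;
        prev_eff_load *= this_power;
        prev_eff_load *= prev_load;

        return this_eff_load <= prev_eff_load;
}

int main(void)
{
        /* both CPUs at full power (1024), imbalance_pct 125, nice-0 task */
        printf("%d\n", wake_affine_balanced(512, 2048, 1024, 1024, 125, 1024));
        return 0;
}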
@@ -1405,29 +1386,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1405/* 1386/*
1406 * Try and locate an idle CPU in the sched_domain. 1387 * Try and locate an idle CPU in the sched_domain.
1407 */ 1388 */
1408static int 1389static int select_idle_sibling(struct task_struct *p, int target)
1409select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1410{ 1390{
1411 int cpu = smp_processor_id(); 1391 int cpu = smp_processor_id();
1412 int prev_cpu = task_cpu(p); 1392 int prev_cpu = task_cpu(p);
1393 struct sched_domain *sd;
1413 int i; 1394 int i;
1414 1395
1415 /* 1396 /*
1416 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1397 * If the task is going to be woken-up on this cpu and if it is
1417 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1398 * already idle, then it is the right target.
1418 * always a better target than the current cpu.
1419 */ 1399 */
1420 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1400 if (target == cpu && idle_cpu(cpu))
1401 return cpu;
1402
1403 /*
1404 * If the task is going to be woken-up on the cpu where it previously
 1405 * ran and if it is currently idle, then it is the right target.
1406 */
1407 if (target == prev_cpu && idle_cpu(prev_cpu))
1421 return prev_cpu; 1408 return prev_cpu;
1422 1409
1423 /* 1410 /*
 1424 * Otherwise, iterate the domain and find an eligible idle cpu. 1411 * Otherwise, iterate the domains and find an eligible idle cpu.
1425 */ 1412 */
1426 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1413 for_each_domain(target, sd) {
1427 if (!cpu_rq(i)->cfs.nr_running) { 1414 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1428 target = i;
1429 break; 1415 break;
1416
1417 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1418 if (idle_cpu(i)) {
1419 target = i;
1420 break;
1421 }
1430 } 1422 }
1423
1424 /*
 1425 * Let's stop looking for an idle sibling once we reach
1426 * the domain that spans the current cpu and prev_cpu.
1427 */
1428 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1429 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1430 break;
1431 } 1431 }
1432 1432
1433 return target; 1433 return target;
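select_idle_sibling() now walks the domains around the target itself rather than taking a single domain argument, and only keeps looking while the domain shares package resources. A simplified model of that walk (plain C; domains are reduced to cpu bitmasks plus a share-package-resources flag, idle state is a lookup table, and cpus_allowed is ignored):

#include <stdbool.h>
#include <stdio.h>

struct domain {
        unsigned long span;             /* bitmask of cpus in the domain */
        bool share_pkg_resources;       /* stand-in for SD_SHARE_PKG_RESOURCES */
};

/* Illustrative sketch, not kernel code: domains ordered smallest first. */
static int select_idle_sibling(const bool *cpu_idle, const struct domain *doms,
                               int nr_doms, int cpu, int prev_cpu, int target)
{
        /* woken on an already idle cpu: it is the right target */
        if (target == cpu && cpu_idle[cpu])
                return cpu;
        if (target == prev_cpu && cpu_idle[prev_cpu])
                return prev_cpu;

        for (int d = 0; d < nr_doms; d++) {
                if (!doms[d].share_pkg_resources)
                        break;          /* crossing a cache boundary: give up */

                for (int i = 0; i < (int)(8 * sizeof(unsigned long)); i++) {
                        if ((doms[d].span & (1UL << i)) && cpu_idle[i]) {
                                target = i;
                                break;
                        }
                }

                /* stop at the domain that spans both cpu and prev_cpu */
                if ((doms[d].span & (1UL << cpu)) &&
                    (doms[d].span & (1UL << prev_cpu)))
                        break;
        }

        return target;
}

int main(void)
{
        bool idle[4] = { false, true, false, false };   /* only cpu1 is idle */
        struct domain doms[] = {
                { 0x3, true  },         /* {cpu0, cpu1} share a cache */
                { 0xf, false },         /* {cpu0..cpu3} do not        */
        };
        printf("%d\n", select_idle_sibling(idle, doms, 2, 0, 2, 0));
        return 0;
}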
@@ -1444,7 +1444,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1444 * 1444 *
1445 * preempt must be disabled. 1445 * preempt must be disabled.
1446 */ 1446 */
1447static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1447static int
1448select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1448{ 1449{
1449 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1450 int cpu = smp_processor_id(); 1451 int cpu = smp_processor_id();
@@ -1455,8 +1456,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1455 int sync = wake_flags & WF_SYNC; 1456 int sync = wake_flags & WF_SYNC;
1456 1457
1457 if (sd_flag & SD_BALANCE_WAKE) { 1458 if (sd_flag & SD_BALANCE_WAKE) {
1458 if (sched_feat(AFFINE_WAKEUPS) && 1459 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1459 cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 want_affine = 1; 1460 want_affine = 1;
1461 new_cpu = prev_cpu; 1461 new_cpu = prev_cpu;
1462 } 1462 }
@@ -1490,34 +1490,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1490 } 1490 }
1491 1491
1492 /* 1492 /*
1493 * While iterating the domains looking for a spanning 1493 * If both cpu and prev_cpu are part of this domain,
1494 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1494 * cpu is a valid SD_WAKE_AFFINE target.
1495 * in cache sharing domains along the way.
1496 */ 1495 */
1497 if (want_affine) { 1496 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1498 int target = -1; 1497 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1499 1498 affine_sd = tmp;
1500 /* 1499 want_affine = 0;
1501 * If both cpu and prev_cpu are part of this domain,
1502 * cpu is a valid SD_WAKE_AFFINE target.
1503 */
1504 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1505 target = cpu;
1506
1507 /*
1508 * If there's an idle sibling in this domain, make that
1509 * the wake_affine target instead of the current cpu.
1510 */
1511 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1512 target = select_idle_sibling(p, tmp, target);
1513
1514 if (target >= 0) {
1515 if (tmp->flags & SD_WAKE_AFFINE) {
1516 affine_sd = tmp;
1517 want_affine = 0;
1518 }
1519 cpu = target;
1520 }
1521 } 1500 }
1522 1501
1523 if (!want_sd && !want_affine) 1502 if (!want_sd && !want_affine)
@@ -1530,22 +1509,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1530 sd = tmp; 1509 sd = tmp;
1531 } 1510 }
1532 1511
1512#ifdef CONFIG_FAIR_GROUP_SCHED
1533 if (sched_feat(LB_SHARES_UPDATE)) { 1513 if (sched_feat(LB_SHARES_UPDATE)) {
1534 /* 1514 /*
1535 * Pick the largest domain to update shares over 1515 * Pick the largest domain to update shares over
1536 */ 1516 */
1537 tmp = sd; 1517 tmp = sd;
1538 if (affine_sd && (!tmp || 1518 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1539 cpumask_weight(sched_domain_span(affine_sd)) >
1540 cpumask_weight(sched_domain_span(sd))))
1541 tmp = affine_sd; 1519 tmp = affine_sd;
1542 1520
1543 if (tmp) 1521 if (tmp) {
1522 raw_spin_unlock(&rq->lock);
1544 update_shares(tmp); 1523 update_shares(tmp);
1524 raw_spin_lock(&rq->lock);
1525 }
1545 } 1526 }
1527#endif
1546 1528
1547 if (affine_sd && wake_affine(affine_sd, p, sync)) 1529 if (affine_sd) {
1548 return cpu; 1530 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1531 return select_idle_sibling(p, cpu);
1532 else
1533 return select_idle_sibling(p, prev_cpu);
1534 }
1549 1535
1550 while (sd) { 1536 while (sd) {
1551 int load_idx = sd->forkexec_idx; 1537 int load_idx = sd->forkexec_idx;
@@ -1575,10 +1561,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1575 1561
1576 /* Now try balancing at a lower domain level of new_cpu */ 1562 /* Now try balancing at a lower domain level of new_cpu */
1577 cpu = new_cpu; 1563 cpu = new_cpu;
1578 weight = cpumask_weight(sched_domain_span(sd)); 1564 weight = sd->span_weight;
1579 sd = NULL; 1565 sd = NULL;
1580 for_each_domain(cpu, tmp) { 1566 for_each_domain(cpu, tmp) {
1581 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1567 if (weight <= tmp->span_weight)
1582 break; 1568 break;
1583 if (tmp->flags & sd_flag) 1569 if (tmp->flags & sd_flag)
1584 sd = tmp; 1570 sd = tmp;
@@ -1590,63 +1576,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1590} 1576}
1591#endif /* CONFIG_SMP */ 1577#endif /* CONFIG_SMP */
1592 1578
1593/*
1594 * Adaptive granularity
1595 *
1596 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1597 * with the limit of wakeup_gran -- when it never does a wakeup.
1598 *
1599 * So the smaller avg_wakeup is the faster we want this task to preempt,
1600 * but we don't want to treat the preemptee unfairly and therefore allow it
1601 * to run for at least the amount of time we'd like to run.
1602 *
1603 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1604 *
1605 * NOTE: we use *nr_running to scale with load, this nicely matches the
1606 * degrading latency on load.
1607 */
1608static unsigned long
1609adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1610{
1611 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1612 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1613 u64 gran = 0;
1614
1615 if (this_run < expected_wakeup)
1616 gran = expected_wakeup - this_run;
1617
1618 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1619}
1620
1621static unsigned long 1579static unsigned long
1622wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1580wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1623{ 1581{
1624 unsigned long gran = sysctl_sched_wakeup_granularity; 1582 unsigned long gran = sysctl_sched_wakeup_granularity;
1625 1583
1626 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1627 gran = adaptive_gran(curr, se);
1628
1629 /* 1584 /*
 1630 * Since it's curr running now, convert the gran from real-time 1585 * Since it's curr running now, convert the gran from real-time
 1631 * to virtual-time in its units. 1586 * to virtual-time in its units.
1587 *
1588 * By using 'se' instead of 'curr' we penalize light tasks, so
1589 * they get preempted easier. That is, if 'se' < 'curr' then
1590 * the resulting gran will be larger, therefore penalizing the
1591 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1592 * be smaller, again penalizing the lighter task.
1593 *
1594 * This is especially important for buddies when the leftmost
1595 * task is higher priority than the buddy.
1632 */ 1596 */
1633 if (sched_feat(ASYM_GRAN)) { 1597 if (unlikely(se->load.weight != NICE_0_LOAD))
1634 /* 1598 gran = calc_delta_fair(gran, se);
1635 * By using 'se' instead of 'curr' we penalize light tasks, so
1636 * they get preempted easier. That is, if 'se' < 'curr' then
1637 * the resulting gran will be larger, therefore penalizing the
1638 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1639 * be smaller, again penalizing the lighter task.
1640 *
1641 * This is especially important for buddies when the leftmost
1642 * task is higher priority than the buddy.
1643 */
1644 if (unlikely(se->load.weight != NICE_0_LOAD))
1645 gran = calc_delta_fair(gran, se);
1646 } else {
1647 if (unlikely(curr->load.weight != NICE_0_LOAD))
1648 gran = calc_delta_fair(gran, curr);
1649 }
1650 1599
1651 return gran; 1600 return gran;
1652} 1601}
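With adaptive_gran() and the ASYM_GRAN branch removed, wakeup_gran() above is reduced to a single scaling step: converting the granularity into the wakee's virtual time. A sketch of that conversion (plain C; calc_delta_fair() is approximated by a plain NICE_0_LOAD/weight ratio, ignoring its fixed-point inverse-weight rounding, and the weights in main() are nice 0 and roughly nice +5):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/*
 * Illustrative sketch, not kernel code: a weight below NICE_0_LOAD
 * inflates the granularity and a weight above it shrinks it, which is
 * the asymmetry the comment in the diff describes.
 */
static unsigned long wakeup_gran(unsigned long gran_ns, unsigned long se_weight)
{
        if (se_weight != NICE_0_LOAD)
                gran_ns = gran_ns * NICE_0_LOAD / se_weight;  /* ~calc_delta_fair() */
        return gran_ns;
}

int main(void)
{
        /* 1ms of granularity as seen by a nice-0 (1024) and a lighter (335) task */
        printf("%lu %lu\n", wakeup_gran(1000000, 1024), wakeup_gran(1000000, 335));
        return 0;
}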
@@ -1704,7 +1653,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1704 struct task_struct *curr = rq->curr; 1653 struct task_struct *curr = rq->curr;
1705 struct sched_entity *se = &curr->se, *pse = &p->se; 1654 struct sched_entity *se = &curr->se, *pse = &p->se;
1706 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1655 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1707 int sync = wake_flags & WF_SYNC;
1708 int scale = cfs_rq->nr_running >= sched_nr_latency; 1656 int scale = cfs_rq->nr_running >= sched_nr_latency;
1709 1657
1710 if (unlikely(rt_prio(p->prio))) 1658 if (unlikely(rt_prio(p->prio)))
@@ -1737,14 +1685,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1737 if (unlikely(curr->policy == SCHED_IDLE)) 1685 if (unlikely(curr->policy == SCHED_IDLE))
1738 goto preempt; 1686 goto preempt;
1739 1687
1740 if (sched_feat(WAKEUP_SYNC) && sync)
1741 goto preempt;
1742
1743 if (sched_feat(WAKEUP_OVERLAP) &&
1744 se->avg_overlap < sysctl_sched_migration_cost &&
1745 pse->avg_overlap < sysctl_sched_migration_cost)
1746 goto preempt;
1747
1748 if (!sched_feat(WAKEUP_PREEMPT)) 1688 if (!sched_feat(WAKEUP_PREEMPT))
1749 return; 1689 return;
1750 1690
@@ -1815,57 +1755,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1755 */
1816 1756
1817/* 1757/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1758 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1759 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1760 */
1824static struct task_struct * 1761static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1762 struct rq *this_rq, int this_cpu)
1826{ 1763{
1827 struct task_struct *p = NULL; 1764 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1765 set_task_cpu(p, this_cpu);
1766 activate_task(this_rq, p, 0);
1767 check_preempt_curr(this_rq, p, 0);
1768}
1829 1769
1830 if (next == &cfs_rq->tasks) 1770/*
1831 return NULL; 1771 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1772 */
1773static
1774int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1775 struct sched_domain *sd, enum cpu_idle_type idle,
1776 int *all_pinned)
1777{
1778 int tsk_cache_hot = 0;
1779 /*
1780 * We do not migrate tasks that are:
1781 * 1) running (obviously), or
1782 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1783 * 3) are cache-hot on their current CPU.
1784 */
1785 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1786 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1787 return 0;
1788 }
1789 *all_pinned = 0;
1832 1790
1833 se = list_entry(next, struct sched_entity, group_node); 1791 if (task_running(rq, p)) {
1834 p = task_of(se); 1792 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1793 return 0;
1794 }
1836 1795
1837 return p; 1796 /*
1838} 1797 * Aggressive migration if:
1798 * 1) task is cache cold, or
1799 * 2) too many balance attempts have failed.
1800 */
1839 1801
1840static struct task_struct *load_balance_start_fair(void *arg) 1802 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1803 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1804 sd->nr_balance_failed > sd->cache_nice_tries) {
1805#ifdef CONFIG_SCHEDSTATS
1806 if (tsk_cache_hot) {
1807 schedstat_inc(sd, lb_hot_gained[idle]);
1808 schedstat_inc(p, se.statistics.nr_forced_migrations);
1809 }
1810#endif
1811 return 1;
1812 }
1843 1813
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1814 if (tsk_cache_hot) {
1815 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1816 return 0;
1817 }
1818 return 1;
1845} 1819}
1846 1820
1847static struct task_struct *load_balance_next_fair(void *arg) 1821/*
1822 * move_one_task tries to move exactly one task from busiest to this_rq, as
1823 * part of active balancing operations within "domain".
1824 * Returns 1 if successful and 0 otherwise.
1825 *
1826 * Called with both runqueues locked.
1827 */
1828static int
1829move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1830 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1831{
1849 struct cfs_rq *cfs_rq = arg; 1832 struct task_struct *p, *n;
1833 struct cfs_rq *cfs_rq;
1834 int pinned = 0;
1850 1835
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1836 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1837 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1838
1839 if (!can_migrate_task(p, busiest, this_cpu,
1840 sd, idle, &pinned))
1841 continue;
1842
1843 pull_task(busiest, p, this_rq, this_cpu);
1844 /*
1845 * Right now, this is only the second place pull_task()
1846 * is called, so we can safely collect pull_task()
1847 * stats here rather than inside pull_task().
1848 */
1849 schedstat_inc(sd, lb_gained[idle]);
1850 return 1;
1851 }
1852 }
1853
1854 return 0;
1852} 1855}
1853 1856
1854static unsigned long 1857static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1858balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1859 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1860 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1861 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1862{
1860 struct rq_iterator cfs_rq_iterator; 1863 int loops = 0, pulled = 0, pinned = 0;
1864 long rem_load_move = max_load_move;
1865 struct task_struct *p, *n;
1861 1866
1862 cfs_rq_iterator.start = load_balance_start_fair; 1867 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1868 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1869
1866 return balance_tasks(this_rq, this_cpu, busiest, 1870 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1871
1868 this_best_prio, &cfs_rq_iterator); 1872 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1873 if (loops++ > sysctl_sched_nr_migrate)
1874 break;
1875
1876 if ((p->se.load.weight >> 1) > rem_load_move ||
1877 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1878 continue;
1879
1880 pull_task(busiest, p, this_rq, this_cpu);
1881 pulled++;
1882 rem_load_move -= p->se.load.weight;
1883
1884#ifdef CONFIG_PREEMPT
1885 /*
1886 * NEWIDLE balancing is a source of latency, so preemptible
1887 * kernels will stop after the first task is pulled to minimize
1888 * the critical section.
1889 */
1890 if (idle == CPU_NEWLY_IDLE)
1891 break;
1892#endif
1893
1894 /*
1895 * We only want to steal up to the prescribed amount of
1896 * weighted load.
1897 */
1898 if (rem_load_move <= 0)
1899 break;
1900
1901 if (p->prio < *this_best_prio)
1902 *this_best_prio = p->prio;
1903 }
1904out:
1905 /*
1906 * Right now, this is one of only two places pull_task() is called,
1907 * so we can safely collect pull_task() stats here rather than
1908 * inside pull_task().
1909 */
1910 schedstat_add(sd, lb_gained[idle], pulled);
1911
1912 if (all_pinned)
1913 *all_pinned = pinned;
1914
1915 return max_load_move - rem_load_move;
1869} 1916}
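The old iterator-based __load_balance_fair() is replaced above by balance_tasks() walking the cfs_rq task list directly under a weighted-load budget. A standalone sketch of that budget loop (plain C; tasks are reduced to a weight plus a hypothetical "migratable" flag standing in for can_migrate_task(), and the scan limit is passed as a parameter instead of sysctl_sched_nr_migrate):

#include <stdbool.h>
#include <stdio.h>

struct cand { unsigned long weight; bool migratable; };

/* Illustrative sketch, not kernel code: returns the weighted load "pulled". */
static unsigned long balance_tasks(const struct cand *tasks, int nr,
                                   unsigned long max_load_move,
                                   int nr_migrate_limit)
{
        long rem = (long)max_load_move;
        int loops = 0;

        if (!max_load_move)
                return 0;

        for (int i = 0; i < nr && loops++ <= nr_migrate_limit; i++) {
                /* too heavy for what is left, or pinned: skip it */
                if ((long)(tasks[i].weight >> 1) > rem || !tasks[i].migratable)
                        continue;

                rem -= (long)tasks[i].weight;   /* "pull" the task */
                if (rem <= 0)
                        break;
        }

        return (unsigned long)((long)max_load_move - rem);
}

int main(void)
{
        struct cand tasks[] = {
                { 3072, true  },        /* too heavy: 3072/2 > 1024 budget */
                { 1024, false },        /* pinned                          */
                { 1024, true  },        /* pulled, budget exhausted        */
        };
        printf("%lu\n", balance_tasks(tasks, 3, 1024, 32));
        return 0;
}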
1870 1917
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1918#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +1944,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 1944 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 1945 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 1946
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 1947 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 1948 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 1949 busiest_cfs_rq);
1903 1950
1904 if (!moved_load) 1951 if (!moved_load)
1905 continue; 1952 continue;
@@ -1922,35 +1969,1528 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 1969 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 1970 int *all_pinned, int *this_best_prio)
1924{ 1971{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 1972 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 1973 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 1974 this_best_prio, &busiest->cfs);
1928} 1975}
1929#endif 1976#endif
1930 1977
1931static int 1978/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1979 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 1980 * this_rq, as part of a balancing operation within domain "sd".
1981 * Returns 1 if successful and 0 otherwise.
1982 *
1983 * Called with both runqueues locked.
1984 */
1985static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1986 unsigned long max_load_move,
1987 struct sched_domain *sd, enum cpu_idle_type idle,
1988 int *all_pinned)
1989{
1990 unsigned long total_load_moved = 0, load_moved;
1991 int this_best_prio = this_rq->curr->prio;
1992
1993 do {
1994 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
1995 max_load_move - total_load_moved,
1996 sd, idle, all_pinned, &this_best_prio);
1997
1998 total_load_moved += load_moved;
1999
2000#ifdef CONFIG_PREEMPT
2001 /*
2002 * NEWIDLE balancing is a source of latency, so preemptible
2003 * kernels will stop after the first task is pulled to minimize
2004 * the critical section.
2005 */
2006 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2007 break;
2008
2009 if (raw_spin_is_contended(&this_rq->lock) ||
2010 raw_spin_is_contended(&busiest->lock))
2011 break;
2012#endif
2013 } while (load_moved && max_load_move > total_load_moved);
2014
2015 return total_load_moved > 0;
2016}
2017
2018/********** Helpers for find_busiest_group ************************/
2019/*
2020 * sd_lb_stats - Structure to store the statistics of a sched_domain
2021 * during load balancing.
2022 */
2023struct sd_lb_stats {
2024 struct sched_group *busiest; /* Busiest group in this sd */
2025 struct sched_group *this; /* Local group in this sd */
2026 unsigned long total_load; /* Total load of all groups in sd */
2027 unsigned long total_pwr; /* Total power of all groups in sd */
2028 unsigned long avg_load; /* Average load across all groups in sd */
2029
2030 /** Statistics of this group */
2031 unsigned long this_load;
2032 unsigned long this_load_per_task;
2033 unsigned long this_nr_running;
2034
2035 /* Statistics of the busiest group */
2036 unsigned long max_load;
2037 unsigned long busiest_load_per_task;
2038 unsigned long busiest_nr_running;
2039 unsigned long busiest_group_capacity;
2040
2041 int group_imb; /* Is there imbalance in this sd */
2042#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2043 int power_savings_balance; /* Is powersave balance needed for this sd */
2044 struct sched_group *group_min; /* Least loaded group in sd */
2045 struct sched_group *group_leader; /* Group which relieves group_min */
2046 unsigned long min_load_per_task; /* load_per_task in group_min */
2047 unsigned long leader_nr_running; /* Nr running of group_leader */
2048 unsigned long min_nr_running; /* Nr running of group_min */
2049#endif
2050};
2051
2052/*
2053 * sg_lb_stats - stats of a sched_group required for load_balancing
2054 */
2055struct sg_lb_stats {
2056 unsigned long avg_load; /*Avg load across the CPUs of the group */
2057 unsigned long group_load; /* Total load over the CPUs of the group */
2058 unsigned long sum_nr_running; /* Nr tasks running in the group */
2059 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2060 unsigned long group_capacity;
2061 int group_imb; /* Is there an imbalance in the group ? */
2062};
2063
2064/**
2065 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2066 * @group: The group whose first cpu is to be returned.
2067 */
2068static inline unsigned int group_first_cpu(struct sched_group *group)
2069{
2070 return cpumask_first(sched_group_cpus(group));
2071}
2072
2073/**
2074 * get_sd_load_idx - Obtain the load index for a given sched domain.
2075 * @sd: The sched_domain whose load_idx is to be obtained.
 2076 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
2077 */
2078static inline int get_sd_load_idx(struct sched_domain *sd,
2079 enum cpu_idle_type idle)
2080{
2081 int load_idx;
2082
2083 switch (idle) {
2084 case CPU_NOT_IDLE:
2085 load_idx = sd->busy_idx;
2086 break;
2087
2088 case CPU_NEWLY_IDLE:
2089 load_idx = sd->newidle_idx;
2090 break;
2091 default:
2092 load_idx = sd->idle_idx;
2093 break;
2094 }
2095
2096 return load_idx;
2097}
2098
2099
2100#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2101/**
2102 * init_sd_power_savings_stats - Initialize power savings statistics for
2103 * the given sched_domain, during load balancing.
2104 *
2105 * @sd: Sched domain whose power-savings statistics are to be initialized.
2106 * @sds: Variable containing the statistics for sd.
2107 * @idle: Idle status of the CPU at which we're performing load-balancing.
2108 */
2109static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2110 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2111{
2112 /*
2113 * Busy processors will not participate in power savings
2114 * balance.
2115 */
2116 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2117 sds->power_savings_balance = 0;
2118 else {
2119 sds->power_savings_balance = 1;
2120 sds->min_nr_running = ULONG_MAX;
2121 sds->leader_nr_running = 0;
2122 }
2123}
2124
2125/**
2126 * update_sd_power_savings_stats - Update the power saving stats for a
2127 * sched_domain while performing load balancing.
2128 *
2129 * @group: sched_group belonging to the sched_domain under consideration.
2130 * @sds: Variable containing the statistics of the sched_domain
2131 * @local_group: Does group contain the CPU for which we're performing
2132 * load balancing ?
2133 * @sgs: Variable containing the statistics of the group.
2134 */
2135static inline void update_sd_power_savings_stats(struct sched_group *group,
2136 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2137{
2138
2139 if (!sds->power_savings_balance)
2140 return;
2141
2142 /*
2143 * If the local group is idle or completely loaded
2144 * no need to do power savings balance at this domain
2145 */
2146 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2147 !sds->this_nr_running))
2148 sds->power_savings_balance = 0;
2149
2150 /*
2151 * If a group is already running at full capacity or idle,
2152 * don't include that group in power savings calculations
2153 */
2154 if (!sds->power_savings_balance ||
2155 sgs->sum_nr_running >= sgs->group_capacity ||
2156 !sgs->sum_nr_running)
2157 return;
2158
2159 /*
2160 * Calculate the group which has the least non-idle load.
2161 * This is the group from where we need to pick up the load
2162 * for saving power
2163 */
2164 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2165 (sgs->sum_nr_running == sds->min_nr_running &&
2166 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2167 sds->group_min = group;
2168 sds->min_nr_running = sgs->sum_nr_running;
2169 sds->min_load_per_task = sgs->sum_weighted_load /
2170 sgs->sum_nr_running;
2171 }
2172
2173 /*
2174 * Calculate the group which is almost near its
2175 * capacity but still has some space to pick up some load
2176 * from other group and save more power
2177 */
2178 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2179 return;
2180
2181 if (sgs->sum_nr_running > sds->leader_nr_running ||
2182 (sgs->sum_nr_running == sds->leader_nr_running &&
2183 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2184 sds->group_leader = group;
2185 sds->leader_nr_running = sgs->sum_nr_running;
2186 }
2187}
2188
2189/**
2190 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * under consideration.
2193 * @this_cpu: Cpu at which we're currently performing load-balancing.
2194 * @imbalance: Variable to store the imbalance.
2195 *
2196 * Description:
2197 * Check if we have potential to perform some power-savings balance.
2198 * If yes, set the busiest group to be the least loaded group in the
 2199 * sched_domain, so that its CPUs can be put to idle.
2200 *
2201 * Returns 1 if there is potential to perform power-savings balance.
2202 * Else returns 0.
2203 */
2204static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2205 int this_cpu, unsigned long *imbalance)
2206{
2207 if (!sds->power_savings_balance)
2208 return 0;
2209
2210 if (sds->this != sds->group_leader ||
2211 sds->group_leader == sds->group_min)
2212 return 0;
2213
2214 *imbalance = sds->min_load_per_task;
2215 sds->busiest = sds->group_min;
2216
2217 return 1;
2218
2219}
2220#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2221static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2222 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2223{
2224 return;
2225}
2226
2227static inline void update_sd_power_savings_stats(struct sched_group *group,
2228 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2229{
2230 return;
2231}
2232
2233static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2234 int this_cpu, unsigned long *imbalance)
2235{
2236 return 0;
2237}
2238#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2239
2240
2241unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2242{
2243 return SCHED_LOAD_SCALE;
2244}
2245
2246unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2247{
2248 return default_scale_freq_power(sd, cpu);
2249}
2250
2251unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2252{
2253 unsigned long weight = sd->span_weight;
2254 unsigned long smt_gain = sd->smt_gain;
2255
2256 smt_gain /= weight;
2257
2258 return smt_gain;
2259}
2260
2261unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2262{
2263 return default_scale_smt_power(sd, cpu);
2264}
2265
2266unsigned long scale_rt_power(int cpu)
2267{
2268 struct rq *rq = cpu_rq(cpu);
2269 u64 total, available;
2270
2271 sched_avg_update(rq);
2272
2273 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2274 available = total - rq->rt_avg;
2275
2276 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2277 total = SCHED_LOAD_SCALE;
2278
2279 total >>= SCHED_LOAD_SHIFT;
2280
2281 return div_u64(available, total);
2282}
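scale_rt_power() above reports, in SCHED_LOAD_SCALE units, how much of the averaging period is left over after real-time activity. The same arithmetic in isolation (plain C; the nanosecond figures in main() are invented):

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/*
 * Illustrative sketch, not kernel code: a CPU that spent a quarter of the
 * period on RT work reports roughly 768/1024 of its power for CFS.
 */
static uint64_t scale_rt_power(uint64_t period_ns, uint64_t rt_avg_ns)
{
        uint64_t total = period_ns;
        uint64_t available = total - rt_avg_ns;

        if (total < SCHED_LOAD_SCALE)   /* guard against a tiny total */
                total = SCHED_LOAD_SCALE;
        total >>= SCHED_LOAD_SHIFT;

        return available / total;       /* ~SCHED_LOAD_SCALE * available/total */
}

int main(void)
{
        /* a 1s period with 250ms of RT time */
        printf("%llu\n", (unsigned long long)scale_rt_power(1000000000ULL, 250000000ULL));
        return 0;
}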
2283
2284static void update_cpu_power(struct sched_domain *sd, int cpu)
2285{
2286 unsigned long weight = sd->span_weight;
2287 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups;
2289
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu);
2300 else
2301 power *= default_scale_smt_power(sd, cpu);
2302
2303 power >>= SCHED_LOAD_SHIFT;
2304 }
2305
2306 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT;
2308
2309 if (!power)
2310 power = 1;
2311
2312 cpu_rq(cpu)->cpu_power = power;
2313 sdg->cpu_power = power;
2314}
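update_cpu_power() chains several fixed-point factors, each expressed against SCHED_LOAD_SCALE and shifted back down after every multiplication, so the result stays on the 1024 scale. A sketch of just that composition (plain C; the three factor values in main() are invented stand-ins for the frequency, SMT and RT scalings):

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

/* Illustrative sketch, not kernel code: compose fixed-point scale factors. */
static unsigned long compose_cpu_power(unsigned long freq_factor,
                                       unsigned long smt_factor,
                                       unsigned long rt_factor)
{
        unsigned long power = SCHED_LOAD_SCALE;

        power = (power * freq_factor) >> SCHED_LOAD_SHIFT;
        power = (power * smt_factor)  >> SCHED_LOAD_SHIFT;
        power = (power * rt_factor)   >> SCHED_LOAD_SHIFT;

        return power ? power : 1;       /* never report zero power */
}

int main(void)
{
        /* full frequency, an SMT discount, and ~10% of time lost to RT */
        printf("%lu\n", compose_cpu_power(1024, 589, 922));
        return 0;
}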
2315
2316static void update_group_power(struct sched_domain *sd, int cpu)
2317{
2318 struct sched_domain *child = sd->child;
2319 struct sched_group *group, *sdg = sd->groups;
2320 unsigned long power;
2321
2322 if (!child) {
2323 update_cpu_power(sd, cpu);
2324 return;
2325 }
2326
2327 power = 0;
2328
2329 group = child->groups;
2330 do {
2331 power += group->cpu_power;
2332 group = group->next;
2333 } while (group != child->groups);
2334
2335 sdg->cpu_power = power;
2336}
2337
2338/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated.
2341 * @group: sched_group whose statistics are to be updated.
2342 * @this_cpu: Cpu for which load balance is currently performed.
2343 * @idle: Idle status of this_cpu
2344 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2345 * @sd_idle: Idle status of the sched_domain containing group.
2346 * @local_group: Does group contain this_cpu.
2347 * @cpus: Set of cpus considered for load balancing.
2348 * @balance: Should we balance.
2349 * @sgs: variable to hold the statistics for this group.
2350 */
2351static inline void update_sg_lb_stats(struct sched_domain *sd,
2352 struct sched_group *group, int this_cpu,
2353 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2354 int local_group, const struct cpumask *cpus,
2355 int *balance, struct sg_lb_stats *sgs)
2356{
2357 unsigned long load, max_cpu_load, min_cpu_load;
2358 int i;
2359 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2360 unsigned long avg_load_per_task = 0;
2361
2362 if (local_group)
2363 balance_cpu = group_first_cpu(group);
2364
2365 /* Tally up the load of all CPUs in the group */
2366 max_cpu_load = 0;
2367 min_cpu_load = ~0UL;
2368
2369 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2370 struct rq *rq = cpu_rq(i);
2371
2372 if (*sd_idle && rq->nr_running)
2373 *sd_idle = 0;
2374
2375 /* Bias balancing toward cpus of our domain */
2376 if (local_group) {
2377 if (idle_cpu(i) && !first_idle_cpu) {
2378 first_idle_cpu = 1;
2379 balance_cpu = i;
2380 }
2381
2382 load = target_load(i, load_idx);
2383 } else {
2384 load = source_load(i, load_idx);
2385 if (load > max_cpu_load)
2386 max_cpu_load = load;
2387 if (min_cpu_load > load)
2388 min_cpu_load = load;
2389 }
2390
2391 sgs->group_load += load;
2392 sgs->sum_nr_running += rq->nr_running;
2393 sgs->sum_weighted_load += weighted_cpuload(i);
2394
2395 }
2396
2397 /*
2398 * First idle cpu or the first cpu(busiest) in this sched group
2399 * is eligible for doing load balancing at this and above
 2400 * domains. In the newly idle case, we will allow all the cpus
2401 * to do the newly idle load balance.
2402 */
2403 if (idle != CPU_NEWLY_IDLE && local_group &&
2404 balance_cpu != this_cpu) {
2405 *balance = 0;
2406 return;
2407 }
2408
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413
2414 /*
2415 * Consider the group unbalanced when the imbalance is larger
2416 * than the average weight of two tasks.
2417 *
2418 * APZ: with cgroup the avg task weight can vary wildly and
2419 * might not be a suitable number - should we keep a
2420 * normalized nr_running number somewhere that negates
2421 * the hierarchy?
2422 */
2423 if (sgs->sum_nr_running)
2424 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2425
2426 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2427 sgs->group_imb = 1;
2428
2429 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2431}
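The tail of update_sg_lb_stats() derives three values from the raw per-cpu sums: the power-adjusted average load, the group-imbalance heuristic (spread between the most and least loaded CPU larger than twice the average task weight), and the capacity in whole CPUs. A sketch of those formulas (plain C; the inputs in main() are invented):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct sg_stats {
        unsigned long avg_load;
        unsigned long group_capacity;
        int group_imb;
};

/* Illustrative sketch, not kernel code. */
static struct sg_stats sg_stats(unsigned long group_load,
                                unsigned long group_power,
                                unsigned long max_cpu_load,
                                unsigned long min_cpu_load,
                                unsigned long sum_weighted_load,
                                unsigned long sum_nr_running)
{
        struct sg_stats s = { 0, 0, 0 };
        unsigned long avg_task = 0;

        /* adjust by the relative power of the group */
        s.avg_load = group_load * SCHED_LOAD_SCALE / group_power;

        if (sum_nr_running)
                avg_task = sum_weighted_load / sum_nr_running;
        if (max_cpu_load - min_cpu_load > 2 * avg_task)
                s.group_imb = 1;

        s.group_capacity = DIV_ROUND_CLOSEST(group_power, SCHED_LOAD_SCALE);
        return s;
}

int main(void)
{
        /* two full-power CPUs, three nice-0 tasks, one CPU holding all of them */
        struct sg_stats s = sg_stats(3072, 2048, 3072, 0, 3072, 3);
        printf("avg=%lu cap=%lu imb=%d\n", s.avg_load, s.group_capacity, s.group_imb);
        return 0;
}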
2432
2433/**
 2434 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2435 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group.
2439 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain.
2442 */
2443static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2444 enum cpu_idle_type idle, int *sd_idle,
2445 const struct cpumask *cpus, int *balance,
2446 struct sd_lb_stats *sds)
2447{
2448 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups;
2450 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0;
2452
2453 if (child && child->flags & SD_PREFER_SIBLING)
2454 prefer_sibling = 1;
2455
2456 init_sd_power_savings_stats(sd, sds, idle);
2457 load_idx = get_sd_load_idx(sd, idle);
2458
2459 do {
2460 int local_group;
2461
2462 local_group = cpumask_test_cpu(this_cpu,
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs);
2467
2468 if (local_group && !(*balance))
2469 return;
2470
2471 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power;
2473
2474 /*
2475 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try
2477 * and move all the excess tasks away.
2478 */
2479 if (prefer_sibling)
2480 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2481
2482 if (local_group) {
2483 sds->this_load = sgs.avg_load;
2484 sds->this = group;
2485 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load &&
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load;
2491 sds->busiest = group;
2492 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb;
2496 }
2497
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2499 group = group->next;
2500 } while (group != sd->groups);
2501}
2502
2503/**
2504 * fix_small_imbalance - Calculate the minor imbalance that exists
2505 * amongst the groups of a sched_domain, during
2506 * load balancing.
2507 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2508 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2509 * @imbalance: Variable to store the imbalance.
2510 */
2511static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2512 int this_cpu, unsigned long *imbalance)
2513{
2514 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2515 unsigned int imbn = 2;
2516 unsigned long scaled_busy_load_per_task;
2517
2518 if (sds->this_nr_running) {
2519 sds->this_load_per_task /= sds->this_nr_running;
2520 if (sds->busiest_load_per_task >
2521 sds->this_load_per_task)
2522 imbn = 1;
2523 } else
2524 sds->this_load_per_task =
2525 cpu_avg_load_per_task(this_cpu);
2526
2527 scaled_busy_load_per_task = sds->busiest_load_per_task
2528 * SCHED_LOAD_SCALE;
2529 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2530
2531 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2532 (scaled_busy_load_per_task * imbn)) {
2533 *imbalance = sds->busiest_load_per_task;
2534 return;
2535 }
2536
2537 /*
2538 * OK, we don't have enough imbalance to justify moving tasks,
2539 * however we may be able to increase total CPU power used by
2540 * moving them.
2541 */
2542
2543 pwr_now += sds->busiest->cpu_power *
2544 min(sds->busiest_load_per_task, sds->max_load);
2545 pwr_now += sds->this->cpu_power *
2546 min(sds->this_load_per_task, sds->this_load);
2547 pwr_now /= SCHED_LOAD_SCALE;
2548
2549 /* Amount of load we'd subtract */
2550 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2551 sds->busiest->cpu_power;
2552 if (sds->max_load > tmp)
2553 pwr_move += sds->busiest->cpu_power *
2554 min(sds->busiest_load_per_task, sds->max_load - tmp);
2555
2556 /* Amount of load we'd add */
2557 if (sds->max_load * sds->busiest->cpu_power <
2558 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2559 tmp = (sds->max_load * sds->busiest->cpu_power) /
2560 sds->this->cpu_power;
2561 else
2562 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2563 sds->this->cpu_power;
2564 pwr_move += sds->this->cpu_power *
2565 min(sds->this_load_per_task, sds->this_load + tmp);
2566 pwr_move /= SCHED_LOAD_SCALE;
2567
2568 /* Move if we gain throughput */
2569 if (pwr_move > pwr_now)
2570 *imbalance = sds->busiest_load_per_task;
2571}
2572
2573/**
2574 * calculate_imbalance - Calculate the amount of imbalance present within the
2575 * groups of a given sched_domain during load balance.
2576 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2577 * @this_cpu: Cpu for which currently load balance is being performed.
2578 * @imbalance: The variable to store the imbalance.
2579 */
2580static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2581 unsigned long *imbalance)
2582{
2583 unsigned long max_pull, load_above_capacity = ~0UL;
2584
2585 sds->busiest_load_per_task /= sds->busiest_nr_running;
2586 if (sds->group_imb) {
2587 sds->busiest_load_per_task =
2588 min(sds->busiest_load_per_task, sds->avg_load);
2589 }
2590
2591 /*
2592 * In the presence of smp nice balancing, certain scenarios can have
2593 * max load less than avg load(as we skip the groups at or below
2594 * its cpu_power, while calculating max_load..)
2595 */
2596 if (sds->max_load < sds->avg_load) {
2597 *imbalance = 0;
2598 return fix_small_imbalance(sds, this_cpu, imbalance);
2599 }
2600
2601 if (!sds->group_imb) {
2602 /*
2603 * Don't want to pull so many tasks that a group would go idle.
2604 */
2605 load_above_capacity = (sds->busiest_nr_running -
2606 sds->busiest_group_capacity);
2607
2608 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2609
2610 load_above_capacity /= sds->busiest->cpu_power;
2611 }
2612
2613 /*
2614 * We're trying to get all the cpus to the average_load, so we don't
2615 * want to push ourselves above the average load, nor do we wish to
2616 * reduce the max loaded cpu below the average load. At the same time,
2617 * we also don't want to reduce the group load below the group capacity
2618 * (so that we can implement power-savings policies etc). Thus we look
2619 * for the minimum possible imbalance.
2620 * Be careful of negative numbers as they'll appear as very large values
2621 * with unsigned longs.
2622 */
2623 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2624
2625 /* How much load to actually move to equalise the imbalance */
2626 *imbalance = min(max_pull * sds->busiest->cpu_power,
2627 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2628 / SCHED_LOAD_SCALE;
2629
2630 /*
2631 * if *imbalance is less than the average load per runnable task
 2632 * there is no guarantee that any tasks will be moved so we'll have
2633 * a think about bumping its value to force at least one task to be
2634 * moved
2635 */
2636 if (*imbalance < sds->busiest_load_per_task)
2637 return fix_small_imbalance(sds, this_cpu, imbalance);
2638
2639}
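Ignoring the group_imb and fix_small_imbalance() special cases, the core of calculate_imbalance() reduces to two clamps: pull no more than the busiest group's excess over the average, and no more than what it carries above its capacity, then convert back into weighted load via each group's cpu_power. A sketch (plain C; loads and the whole-CPU capacity in main() are made up):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Illustrative sketch, not kernel code. */
static unsigned long calc_imbalance(unsigned long max_load, unsigned long avg_load,
                                    unsigned long this_load,
                                    unsigned long busiest_nr_running,
                                    unsigned long busiest_capacity,
                                    unsigned long busiest_power,
                                    unsigned long this_power)
{
        unsigned long load_above_capacity, max_pull;

        /* don't pull so many tasks that the busiest group would go idle */
        load_above_capacity  = busiest_nr_running - busiest_capacity;
        load_above_capacity *= SCHED_LOAD_SCALE * SCHED_LOAD_SCALE;
        load_above_capacity /= busiest_power;

        max_pull = min_ul(max_load - avg_load, load_above_capacity);

        /* ...and don't push this group above the average either */
        return min_ul(max_pull * busiest_power,
                      (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;
}

int main(void)
{
        /* busiest group at 1536, average 1024, local group at 512 */
        printf("%lu\n", calc_imbalance(1536, 1024, 512, 3, 1, 1024, 1024));
        return 0;
}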
2640/******* find_busiest_group() helpers end here *********************/
2641
2642/**
2643 * find_busiest_group - Returns the busiest group within the sched_domain
2644 * if there is an imbalance. If there isn't an imbalance, and
2645 * the user has opted for power-savings, it returns a group whose
2646 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2647 * such a group exists.
2648 *
2649 * Also calculates the amount of weighted load which should be moved
2650 * to restore balance.
2651 *
2652 * @sd: The sched_domain whose busiest group is to be returned.
2653 * @this_cpu: The cpu for which load balancing is currently being performed.
2654 * @imbalance: Variable which stores amount of weighted load which should
2655 * be moved to restore balance/put a group to idle.
2656 * @idle: The idle status of this_cpu.
2657 * @sd_idle: The idleness of sd
2658 * @cpus: The set of CPUs under consideration for load-balancing.
2659 * @balance: Pointer to a variable indicating if this_cpu
2660 * is the appropriate cpu to perform load balancing at this_level.
2661 *
2662 * Returns: - the busiest group if imbalance exists.
2663 * - If no imbalance and user has opted for power-savings balance,
2664 * return the least loaded group whose CPUs can be
2665 * put to idle by rebalancing its tasks onto our group.
2666 */
2667static struct sched_group *
2668find_busiest_group(struct sched_domain *sd, int this_cpu,
2669 unsigned long *imbalance, enum cpu_idle_type idle,
2670 int *sd_idle, const struct cpumask *cpus, int *balance)
2671{
2672 struct sd_lb_stats sds;
2673
2674 memset(&sds, 0, sizeof(sds));
2675
2676 /*
 2677 * Compute the various statistics relevant for load balancing at
2678 * this level.
2679 */
2680 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2681 balance, &sds);
2682
2683 /* Cases where imbalance does not exist from POV of this_cpu */
2684 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2685 * at this level.
2686 * 2) There is no busy sibling group to pull from.
2687 * 3) This group is the busiest group.
 2688 * 4) This group is busier than the avg busyness at this
2689 * sched_domain.
2690 * 5) The imbalance is within the specified limit.
2691 */
2692 if (!(*balance))
2693 goto ret;
2694
2695 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced;
2697
2698 if (sds.this_load >= sds.max_load)
2699 goto out_balanced;
2700
2701 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2702
2703 if (sds.this_load >= sds.avg_load)
2704 goto out_balanced;
2705
2706 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2707 goto out_balanced;
2708
2709 /* Looks like there is an imbalance. Compute it */
2710 calculate_imbalance(&sds, this_cpu, imbalance);
2711 return sds.busiest;
2712
2713out_balanced:
2714 /*
2715 * There is no obvious imbalance. But check if we can do some balancing
2716 * to save power.
2717 */
2718 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2719 return sds.busiest;
2720ret:
2721 *imbalance = 0;
2722 return NULL;
2723}
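The early-exit checks in find_busiest_group() above can be read as one predicate over the domain-wide statistics. A sketch of it (plain C; the balance-cpu check and the power-savings fallback are left out, and all loads are the power-scaled averages produced by update_sd_lb_stats(), with invented numbers in main()):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/* Illustrative sketch, not kernel code. */
static bool imbalance_exists(unsigned long this_load, unsigned long max_load,
                             unsigned long total_load, unsigned long total_pwr,
                             unsigned long busiest_nr_running,
                             unsigned int imbalance_pct)
{
        unsigned long avg_load;

        if (!busiest_nr_running)                /* no busy sibling group */
                return false;
        if (this_load >= max_load)              /* we are the busiest group */
                return false;

        avg_load = SCHED_LOAD_SCALE * total_load / total_pwr;
        if (this_load >= avg_load)              /* already above the average */
                return false;

        /* the busiest group must exceed us by more than imbalance_pct */
        return 100 * max_load > imbalance_pct * this_load;
}

int main(void)
{
        /* local group runs 512, the busiest group runs 1536 */
        printf("%d\n", imbalance_exists(512, 1536, 2048, 2048, 3, 125));
        return 0;
}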
2724
2725/*
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */
2728static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2730 unsigned long imbalance, const struct cpumask *cpus)
2731{
2732 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0;
2734 int i;
2735
2736 for_each_cpu(i, sched_group_cpus(group)) {
2737 unsigned long power = power_of(i);
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl;
2740
2741 if (!cpumask_test_cpu(i, cpus))
2742 continue;
2743
2744 rq = cpu_rq(i);
2745 wl = weighted_cpuload(i);
2746
2747 /*
2748 * When comparing with imbalance, use weighted_cpuload()
2749 * which is not scaled with the cpu power.
2750 */
2751 if (capacity && rq->nr_running == 1 && wl > imbalance)
2752 continue;
2753
2754 /*
2755 * For the load comparisons with the other cpu's, consider
2756 * the weighted_cpuload() scaled with the cpu power, so that
2757 * the load can be moved away from the cpu that is potentially
2758 * running at a lower capacity.
2759 */
2760 wl = (wl * SCHED_LOAD_SCALE) / power;
2761
2762 if (wl > max_load) {
2763 max_load = wl;
2764 busiest = rq;
2765 }
2766 }
2767
2768 return busiest;
2769}
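find_busiest_queue() compares the raw weighted load against the imbalance (to skip single-task CPUs whose one task already exceeds it) but picks the busiest runqueue on power-scaled load, so load is preferentially pulled off weaker CPUs. A sketch (plain C; runqueues reduced to a load, a power and a task count, with invented numbers in main()):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

struct rq_info {
        unsigned long weighted_load;    /* stand-in for weighted_cpuload(i) */
        unsigned long power;            /* stand-in for power_of(i)         */
        unsigned int  nr_running;
};

/* Illustrative sketch, not kernel code: returns the index of the busiest rq. */
static int find_busiest_queue(const struct rq_info *rq, int nr,
                              unsigned long imbalance)
{
        unsigned long max_load = 0;
        int busiest = -1;

        for (int i = 0; i < nr; i++) {
                unsigned long capacity = (rq[i].power + SCHED_LOAD_SCALE / 2) /
                                         SCHED_LOAD_SCALE;
                unsigned long wl = rq[i].weighted_load;

                /* a lone task bigger than the imbalance cannot help */
                if (capacity && rq[i].nr_running == 1 && wl > imbalance)
                        continue;

                wl = wl * SCHED_LOAD_SCALE / rq[i].power;       /* power-scale */
                if (wl > max_load) {
                        max_load = wl;
                        busiest = i;
                }
        }

        return busiest;
}

int main(void)
{
        /* cpu1 has less raw load but half the power, so it looks busier */
        struct rq_info rqs[] = {
                { 2048, 1024, 2 },
                { 1536,  512, 2 },
        };
        printf("%d\n", find_busiest_queue(rqs, 2, 512));
        return 0;
}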
2770
2771/*
2772 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2773 * so long as it is large enough.
2774 */
2775#define MAX_PINNED_INTERVAL 512
2776
2777/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2781{
2782 if (idle == CPU_NEWLY_IDLE) {
2783 /*
2784 * The only task running in a non-idle cpu can be moved to this
 2785 * cpu in an attempt to completely free up the other CPU
2786 * package.
2787 *
2788 * The package power saving logic comes from
 2789 * find_busiest_group(). If there is no imbalance, then
2790 * f_b_g() will return NULL. However when sched_mc={1,2} then
2791 * f_b_g() will select a group from which a running task may be
2792 * pulled to this cpu in order to make the other package idle.
2793 * If there is no opportunity to make a package idle and if
 2794 * there is no imbalance, then f_b_g() will return NULL and no
2795 * action will be taken in load_balance_newidle().
2796 *
2797 * Under normal task pull operation due to imbalance, there
2798 * will be more than one task in the source run queue and
2799 * move_tasks() will succeed. ld_moved will be true and this
2800 * active balance code will not be triggered.
2801 */
2802 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2803 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2804 return 0;
2805
2806 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2807 return 0;
2808 }
2809
2810 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2811}
2812
2813static int active_load_balance_cpu_stop(void *data);
2814
2815/*
2816 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2817 * tasks if there is an imbalance.
2818 */
2819static int load_balance(int this_cpu, struct rq *this_rq,
2820 struct sched_domain *sd, enum cpu_idle_type idle,
2821 int *balance)
2822{
2823 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2824 struct sched_group *group;
2825 unsigned long imbalance;
2826 struct rq *busiest;
2827 unsigned long flags;
2828 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2829
2830 cpumask_copy(cpus, cpu_active_mask);
2831
2832 /*
2833	 * When the power savings policy is enabled for the parent domain, an idle
2834	 * sibling can pick up load irrespective of busy siblings. In this case,
2835	 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
2836	 * portraying it as CPU_NOT_IDLE.
2837 */
2838 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2839 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2840 sd_idle = 1;
2841
2842 schedstat_inc(sd, lb_count[idle]);
2843
2844redo:
2845 update_shares(sd);
2846 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2847 cpus, balance);
2848
2849 if (*balance == 0)
2850 goto out_balanced;
2851
2852 if (!group) {
2853 schedstat_inc(sd, lb_nobusyg[idle]);
2854 goto out_balanced;
2855 }
2856
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2858 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced;
2861 }
2862
2863 BUG_ON(busiest == this_rq);
2864
2865 schedstat_add(sd, lb_imbalance[idle], imbalance);
2866
2867 ld_moved = 0;
2868 if (busiest->nr_running > 1) {
2869 /*
2870 * Attempt to move tasks. If find_busiest_group has found
2871 * an imbalance but busiest->nr_running <= 1, the group is
2872 * still unbalanced. ld_moved simply stays zero, so it is
2873 * correctly treated as an imbalance.
2874 */
2875 local_irq_save(flags);
2876 double_rq_lock(this_rq, busiest);
2877 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2878 imbalance, sd, idle, &all_pinned);
2879 double_rq_unlock(this_rq, busiest);
2880 local_irq_restore(flags);
2881
2882 /*
2883 * some other cpu did the load balance for us.
2884 */
2885 if (ld_moved && this_cpu != smp_processor_id())
2886 resched_cpu(this_cpu);
2887
2888 /* All tasks on this runqueue were pinned by CPU affinity */
2889 if (unlikely(all_pinned)) {
2890 cpumask_clear_cpu(cpu_of(busiest), cpus);
2891 if (!cpumask_empty(cpus))
2892 goto redo;
2893 goto out_balanced;
2894 }
2895 }
2896
2897 if (!ld_moved) {
2898 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++;
2900
2901 if (need_active_balance(sd, sd_idle, idle)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags);
2903
2904			/* don't kick active_load_balance_cpu_stop
2905			 * if the current task on the busiest cpu can't be
2906			 * moved to this_cpu
2907 */
2908 if (!cpumask_test_cpu(this_cpu,
2909 &busiest->curr->cpus_allowed)) {
2910 raw_spin_unlock_irqrestore(&busiest->lock,
2911 flags);
2912 all_pinned = 1;
2913 goto out_one_pinned;
2914 }
2915
2916 /*
2917 * ->active_balance synchronizes accesses to
2918 * ->active_balance_work. Once set, it's cleared
2919 * only after active load balance is finished.
2920 */
2921 if (!busiest->active_balance) {
2922 busiest->active_balance = 1;
2923 busiest->push_cpu = this_cpu;
2924 active_balance = 1;
2925 }
2926 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2927
2928 if (active_balance)
2929 stop_one_cpu_nowait(cpu_of(busiest),
2930 active_load_balance_cpu_stop, busiest,
2931 &busiest->active_balance_work);
2932
2933 /*
2934 * We've kicked active balancing, reset the failure
2935 * counter.
2936 */
2937 sd->nr_balance_failed = sd->cache_nice_tries+1;
2938 }
2939 } else
2940 sd->nr_balance_failed = 0;
2941
2942 if (likely(!active_balance)) {
2943 /* We were unbalanced, so reset the balancing interval */
2944 sd->balance_interval = sd->min_interval;
2945 } else {
2946 /*
2947 * If we've begun active balancing, start to back off. This
2948 * case may not be covered by the all_pinned logic if there
2949 * is only 1 task on the busy runqueue (because we don't call
2950 * move_tasks).
2951 */
2952 if (sd->balance_interval < sd->max_interval)
2953 sd->balance_interval *= 2;
2954 }
2955
2956 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2957 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2958 ld_moved = -1;
2959
2960 goto out;
2961
2962out_balanced:
2963 schedstat_inc(sd, lb_balanced[idle]);
2964
2965 sd->nr_balance_failed = 0;
2966
2967out_one_pinned:
2968 /* tune up the balancing interval */
2969 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2970 (sd->balance_interval < sd->max_interval))
2971 sd->balance_interval *= 2;
2972
2973 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2974 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2975 ld_moved = -1;
2976 else
2977 ld_moved = 0;
2978out:
2979 if (ld_moved)
2980 update_shares(sd);
2981 return ld_moved;
2982}
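A minimal sketch of the interval backoff applied above, that is, the doubling in the pinned/active-balance paths and the reset to min_interval once balancing no longer needs to back off; the tunable values below are invented for the example.

#include <stdio.h>

/* made-up sched_domain tunables, in jiffies */
struct dom {
	unsigned long balance_interval;
	unsigned long min_interval;
	unsigned long max_interval;
};

/* roughly what load_balance() does when it had to back off */
static void back_off(struct dom *d)
{
	if (d->balance_interval < d->max_interval)
		d->balance_interval *= 2;
}

/* roughly what load_balance() does when no active balancing was kicked */
static void reset_interval(struct dom *d)
{
	d->balance_interval = d->min_interval;
}

int main(void)
{
	struct dom d = { .balance_interval = 8, .min_interval = 8, .max_interval = 128 };

	for (int i = 1; i <= 6; i++) {
		back_off(&d);
		printf("after back-off %d: interval=%lu\n", i, d.balance_interval);
	}
	reset_interval(&d);
	printf("after reset: interval=%lu\n", d.balance_interval);
	return 0;
}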
2983
2984/*
2985 * idle_balance is called by schedule() if this_cpu is about to become
2986 * idle. Attempts to pull tasks from other CPUs.
2987 */
2988static void idle_balance(int this_cpu, struct rq *this_rq)
1934{ 2989{
1935 struct cfs_rq *busy_cfs_rq; 2990 struct sched_domain *sd;
1936 struct rq_iterator cfs_rq_iterator; 2991 int pulled_task = 0;
2992 unsigned long next_balance = jiffies + HZ;
1937 2993
1938 cfs_rq_iterator.start = load_balance_start_fair; 2994 this_rq->idle_stamp = this_rq->clock;
1939 cfs_rq_iterator.next = load_balance_next_fair; 2995
2996 if (this_rq->avg_idle < sysctl_sched_migration_cost)
2997 return;
2998
2999 /*
3000 * Drop the rq->lock, but keep IRQ/preempt disabled.
3001 */
3002 raw_spin_unlock(&this_rq->lock);
1940 3003
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 3004 for_each_domain(this_cpu, sd) {
3005 unsigned long interval;
3006 int balance = 1;
3007
3008 if (!(sd->flags & SD_LOAD_BALANCE))
3009 continue;
3010
3011 if (sd->flags & SD_BALANCE_NEWIDLE) {
3012 /* If we've pulled tasks over stop searching: */
3013 pulled_task = load_balance(this_cpu, this_rq,
3014 sd, CPU_NEWLY_IDLE, &balance);
3015 }
3016
3017 interval = msecs_to_jiffies(sd->balance_interval);
3018 if (time_after(next_balance, sd->last_balance + interval))
3019 next_balance = sd->last_balance + interval;
3020 if (pulled_task) {
3021 this_rq->idle_stamp = 0;
3022 break;
3023 }
3024 }
3025
3026 raw_spin_lock(&this_rq->lock);
3027
3028 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
1942 /* 3029 /*
1943 * pass busy_cfs_rq argument into 3030 * We are going idle. next_balance may be set based on
1944 * load_balance_[start|next]_fair iterators 3031 * a busy processor. So reset next_balance.
1945 */ 3032 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 3033 this_rq->next_balance = next_balance;
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 3034 }
1948 &cfs_rq_iterator)) 3035}
1949 return 1; 3036
3037/*
3038 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
3039 * running tasks off the busiest CPU onto idle CPUs. It requires at
3040 * least 1 task to be running on each physical CPU where possible, and
3041 * avoids physical / logical imbalances.
3042 */
3043static int active_load_balance_cpu_stop(void *data)
3044{
3045 struct rq *busiest_rq = data;
3046 int busiest_cpu = cpu_of(busiest_rq);
3047 int target_cpu = busiest_rq->push_cpu;
3048 struct rq *target_rq = cpu_rq(target_cpu);
3049 struct sched_domain *sd;
3050
3051 raw_spin_lock_irq(&busiest_rq->lock);
3052
3053 /* make sure the requested cpu hasn't gone down in the meantime */
3054 if (unlikely(busiest_cpu != smp_processor_id() ||
3055 !busiest_rq->active_balance))
3056 goto out_unlock;
3057
3058 /* Is there any task to move? */
3059 if (busiest_rq->nr_running <= 1)
3060 goto out_unlock;
3061
3062 /*
3063	 * This condition is "impossible"; if it occurs,
3064	 * we need to fix it. Originally reported by
3065 * Bjorn Helgaas on a 128-cpu setup.
3066 */
3067 BUG_ON(busiest_rq == target_rq);
3068
3069 /* move a task from busiest_rq to target_rq */
3070 double_lock_balance(busiest_rq, target_rq);
3071
3072 /* Search for an sd spanning us and the target CPU. */
3073 for_each_domain(target_cpu, sd) {
3074 if ((sd->flags & SD_LOAD_BALANCE) &&
3075 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3076 break;
3077 }
3078
3079 if (likely(sd)) {
3080 schedstat_inc(sd, alb_count);
3081
3082 if (move_one_task(target_rq, target_cpu, busiest_rq,
3083 sd, CPU_IDLE))
3084 schedstat_inc(sd, alb_pushed);
3085 else
3086 schedstat_inc(sd, alb_failed);
3087 }
3088 double_unlock_balance(busiest_rq, target_rq);
3089out_unlock:
3090 busiest_rq->active_balance = 0;
3091 raw_spin_unlock_irq(&busiest_rq->lock);
3092 return 0;
3093}
3094
3095#ifdef CONFIG_NO_HZ
3096static struct {
3097 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask;
3099 cpumask_var_t ilb_grp_nohz_mask;
3100} nohz ____cacheline_aligned = {
3101 .load_balancer = ATOMIC_INIT(-1),
3102};
3103
3104int get_nohz_load_balancer(void)
3105{
3106 return atomic_read(&nohz.load_balancer);
3107}
3108
3109#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3110/**
3111 * lowest_flag_domain - Return lowest sched_domain containing flag.
3112 * @cpu: The cpu whose lowest level of sched domain is to
3113 * be returned.
3114 * @flag: The flag to check for the lowest sched_domain
3115 * for the given cpu.
3116 *
3117 * Returns the lowest sched_domain of a cpu which contains the given flag.
3118 */
3119static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3120{
3121 struct sched_domain *sd;
3122
3123 for_each_domain(cpu, sd)
3124 if (sd && (sd->flags & flag))
3125 break;
3126
3127 return sd;
3128}
3129
3130/**
3131 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3132 * @cpu: The cpu whose domains we're iterating over.
3133 * @sd: variable holding the value of the power_savings_sd
3134 * for cpu.
3135 * @flag: The flag to filter the sched_domains to be iterated.
3136 *
3137 * Iterates over all the scheduler domains for a given cpu that have the 'flag'
3138 * set, starting from the lowest sched_domain to the highest.
3139 */
3140#define for_each_flag_domain(cpu, sd, flag) \
3141 for (sd = lowest_flag_domain(cpu, flag); \
3142 (sd && (sd->flags & flag)); sd = sd->parent)
3143
3144/**
3145 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3146 * @ilb_group: group to be checked for semi-idleness
3147 *
3148 * Returns: 1 if the group is semi-idle. 0 otherwise.
3149 *
3150 * We define a sched_group to be semi-idle if it has at least one idle CPU
3151 * and at least one non-idle CPU. This helper function checks if the given
3152 * sched_group is semi-idle or not.
3153 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3157 sched_group_cpus(ilb_group));
3158
3159 /*
3160	 * A sched_group is semi-idle when it has at least one busy cpu
3161	 * and at least one idle cpu.
3162 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3164 return 0;
3165
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3167 return 0;
3168
3169 return 1;
3170}
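The semi-idle test above is just two cpumask operations; a user-space sketch with plain 64-bit masks (standing in for cpumask_t, which this is not) makes the invariant explicit: the intersection of the group with the idle mask must be neither empty nor the whole group.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* cpumasks modelled as 64-bit words, for illustration only */
static bool is_semi_idle(uint64_t group_cpus, uint64_t nohz_cpus)
{
	uint64_t idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)			/* no idle cpu in the group */
		return false;
	if (idle_in_group == group_cpus)	/* every cpu in the group is idle */
		return false;
	return true;				/* at least one idle and one busy */
}

int main(void)
{
	uint64_t group = 0x0f;			/* cpus 0-3 */

	printf("all busy : %d\n", is_semi_idle(group, 0x00));
	printf("semi-idle: %d\n", is_semi_idle(group, 0x03));	/* cpus 0-1 idle */
	printf("all idle : %d\n", is_semi_idle(group, 0x0f));
	return 0;
}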
3171/**
3172 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3173 * @cpu: The cpu which is nominating a new idle_load_balancer.
3174 *
3175 * Returns: the id of the idle load balancer if one exists;
3176 * otherwise, a value >= nr_cpu_ids.
3177 *
3178 * This algorithm picks the idle load balancer such that it belongs to a
3179 * semi-idle powersavings sched_domain. The idea is to avoid waking up
3180 * completely idle packages/cores just for the purpose of idle load balancing
3181 * when there are other idle cpus which are better suited for that job.
3182 */
3183static int find_new_ilb(int cpu)
3184{
3185 struct sched_domain *sd;
3186 struct sched_group *ilb_group;
3187
3188 /*
3189	 * Select the idle load balancer from semi-idle packages only
3190	 * when power-aware load balancing is enabled.
3191 */
3192 if (!(sched_smt_power_savings || sched_mc_power_savings))
3193 goto out_done;
3194
3195 /*
3196 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2)
3200 goto out_done;
3201
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3203 ilb_group = sd->groups;
3204
3205 do {
3206 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask);
3208
3209 ilb_group = ilb_group->next;
3210
3211 } while (ilb_group != sd->groups);
1950 } 3212 }
1951 3213
3214out_done:
3215 return cpumask_first(nohz.cpu_mask);
3216}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu)
3219{
3220 return cpumask_first(nohz.cpu_mask);
3221}
3222#endif
3223
3224/*
3225 * This routine tries to nominate an ilb (idle load balancing) owner
3226 * among the cpus whose ticks are stopped. The ilb owner does the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep until the next wakeup event
3230 * arrives.
3231 *
3232 * For the ilb owner the tick is not stopped, and this tick will be used
3233 * for idle load balancing. The ilb owner will still be part of
3234 * nohz.cpu_mask.
3235 *
3236 * While stopping the tick, this cpu will become the ilb owner if there
3237 * is no other owner, and it will remain the owner until it becomes busy
3238 * or until all cpus in the system stop their ticks, at which point
3239 * there is no need for an ilb owner.
3240 *
3241 * When the ilb owner becomes busy, it nominates another owner during the
3242 * next busy scheduler_tick().
3243 */
3244int select_nohz_load_balancer(int stop_tick)
3245{
3246 int cpu = smp_processor_id();
3247
3248 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0;
3254
3255 /*
3256 * If we are going offline and still the leader,
3257 * give up!
3258 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3260 BUG();
3261
3262 return 0;
3263 }
3264
3265 cpumask_set_cpu(cpu, nohz.cpu_mask);
3266
3267 /* time for ilb owner also to sleep */
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3269 if (atomic_read(&nohz.load_balancer) == cpu)
3270 atomic_set(&nohz.load_balancer, -1);
3271 return 0;
3272 }
3273
3274 if (atomic_read(&nohz.load_balancer) == -1) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb;
3280
3281 if (!(sched_smt_power_savings ||
3282 sched_mc_power_savings))
3283 return 1;
3284 /*
3285 * Check to see if there is a more power-efficient
3286 * ilb.
3287 */
3288 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1);
3291 resched_cpu(new_ilb);
3292 return 0;
3293 }
3294 return 1;
3295 }
3296 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3298 return 0;
3299
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3301
3302 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3304 BUG();
3305 }
1952 return 0; 3306 return 0;
1953} 3307}
3308#endif
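The nomination in select_nohz_load_balancer() hinges on atomic_cmpxchg() against nohz.load_balancer; a user-space approximation with C11 atomics (not the kernel API) shows the election in isolation.

#include <stdio.h>
#include <stdatomic.h>

/* -1 means "no idle load balancer", mirroring nohz.load_balancer */
static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

/* Returns 1 if @cpu won the election and became the owner. */
static int try_become_owner(int cpu)
{
	int expected = -1;

	/* only one contender can swing -1 -> cpu */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Owner gives up the role; the BUG()-style check is omitted in this sketch. */
static void resign_owner(int cpu)
{
	int expected = cpu;

	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu2 elected: %d\n", try_become_owner(2));	/* 1: won */
	printf("cpu5 elected: %d\n", try_become_owner(5));	/* 0: already owned */
	resign_owner(2);
	printf("cpu5 elected: %d\n", try_become_owner(5));	/* 1: now wins */
	return 0;
}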
3309
3310static DEFINE_SPINLOCK(balancing);
3311
3312/*
3313 * It checks each scheduling domain to see if it is due to be balanced,
3314 * and initiates a balancing operation if so.
3315 *
3316 * Balancing parameters are set up in arch_init_sched_domains.
3317 */
3318static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3319{
3320 int balance = 1;
3321 struct rq *rq = cpu_rq(cpu);
3322 unsigned long interval;
3323 struct sched_domain *sd;
3324 /* Earliest time when we have to do rebalance again */
3325 unsigned long next_balance = jiffies + 60*HZ;
3326 int update_next_balance = 0;
3327 int need_serialize;
3328
3329 for_each_domain(cpu, sd) {
3330 if (!(sd->flags & SD_LOAD_BALANCE))
3331 continue;
3332
3333 interval = sd->balance_interval;
3334 if (idle != CPU_IDLE)
3335 interval *= sd->busy_factor;
3336
3337 /* scale ms to jiffies */
3338 interval = msecs_to_jiffies(interval);
3339 if (unlikely(!interval))
3340 interval = 1;
3341 if (interval > HZ*NR_CPUS/10)
3342 interval = HZ*NR_CPUS/10;
3343
3344 need_serialize = sd->flags & SD_SERIALIZE;
3345
3346 if (need_serialize) {
3347 if (!spin_trylock(&balancing))
3348 goto out;
3349 }
3350
3351 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3352 if (load_balance(cpu, rq, sd, idle, &balance)) {
3353 /*
3354 * We've pulled tasks over so either we're no
3355 * longer idle, or one of our SMT siblings is
3356 * not idle.
3357 */
3358 idle = CPU_NOT_IDLE;
3359 }
3360 sd->last_balance = jiffies;
3361 }
3362 if (need_serialize)
3363 spin_unlock(&balancing);
3364out:
3365 if (time_after(next_balance, sd->last_balance + interval)) {
3366 next_balance = sd->last_balance + interval;
3367 update_next_balance = 1;
3368 }
3369
3370 /*
3371 * Stop the load balance at this level. There is another
3372 * CPU in our sched group which is doing load balancing more
3373 * actively.
3374 */
3375 if (!balance)
3376 break;
3377 }
3378
3379 /*
3380 * next_balance will be updated only when there is a need.
3381	 * When the cpu is attached to the null domain, for example, it will not be
3382 * updated.
3383 */
3384 if (likely(update_next_balance))
3385 rq->next_balance = next_balance;
3386}
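A small sketch of the per-domain interval computed at the top of the loop above; the HZ and NR_CPUS values and the msecs_to_jiffies() stand-in are assumptions for illustration, not the kernel's.

#include <stdio.h>

#define HZ	250	/* assumed tick rate */
#define NR_CPUS	8	/* assumed for this sketch */

/* crude msecs_to_jiffies() stand-in (rounds up), for illustration only */
static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;
}

static unsigned long effective_interval(unsigned long interval_ms,
					unsigned int busy_factor, int cpu_idle)
{
	unsigned long interval = interval_ms;

	if (!cpu_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;			/* never drop below one jiffy */
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;	/* global upper clamp */

	return interval;
}

int main(void)
{
	printf("idle cpu: %lu jiffies\n", effective_interval(64, 32, 1));
	printf("busy cpu: %lu jiffies\n", effective_interval(64, 32, 0));
	return 0;
}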
3387
3388/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick.
3390 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */
3393static void run_rebalance_domains(struct softirq_action *h)
3394{
3395 int this_cpu = smp_processor_id();
3396 struct rq *this_rq = cpu_rq(this_cpu);
3397 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3398 CPU_IDLE : CPU_NOT_IDLE;
3399
3400 rebalance_domains(this_cpu, idle);
3401
3402#ifdef CONFIG_NO_HZ
3403 /*
3404 * If this cpu is the owner for idle load balancing, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped.
3407 */
3408 if (this_rq->idle_at_tick &&
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433}
3434
3435static inline int on_null_domain(int cpu)
3436{
3437 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
3438}
3439
3440/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452	 * scheduler tick, then check if we need to nominate a new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ);
3493}
1954 3494
1955static void rq_online_fair(struct rq *rq) 3495static void rq_online_fair(struct rq *rq)
1956{ 3496{
@@ -1962,6 +3502,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3502 update_sysctl();
1963} 3503}
1964 3504
3505#else /* CONFIG_SMP */
3506
3507/*
3508 * on UP we do not need to balance between CPUs:
3509 */
3510static inline void idle_balance(int cpu, struct rq *rq)
3511{
3512}
3513
1965#endif /* CONFIG_SMP */ 3514#endif /* CONFIG_SMP */
1966 3515
1967/* 3516/*
@@ -2076,7 +3625,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3625}
2077#endif 3626#endif
2078 3627
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3628static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3629{
2081 struct sched_entity *se = &task->se; 3630 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3631 unsigned int rr_interval = 0;
@@ -2108,8 +3657,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3657#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3658 .select_task_rq = select_task_rq_fair,
2110 3659
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3660 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3661 .rq_offline = rq_offline_fair,
2115 3662
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since its likely going to consume data we 29 * wakeup-preemption), since its likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -607,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
607 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
608 delta_exec = 0; 614 delta_exec = 0;
609 615
610 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
611 617
612 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
613 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,27 +880,28 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
892} 902}
893 903
894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
895{ 905{
896 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
897 907
@@ -938,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
940 950
941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
942{ 953{
943 struct rq *rq = task_rq(p);
944
945 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id(); 955 return smp_processor_id();
947 956
@@ -1136,7 +1145,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1136 if (next && next->prio < idx) 1145 if (next && next->prio < idx)
1137 continue; 1146 continue;
1138 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1147 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1139 struct task_struct *p = rt_task_of(rt_se); 1148 struct task_struct *p;
1149
1150 if (!rt_entity_is_task(rt_se))
1151 continue;
1152
1153 p = rt_task_of(rt_se);
1140 if (pick_rt_task(rq, p, cpu)) { 1154 if (pick_rt_task(rq, p, cpu)) {
1141 next = p; 1155 next = p;
1142 break; 1156 break;
@@ -1481,24 +1495,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1495 push_rt_tasks(rq);
1482} 1496}
1483 1497
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1498static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1499 const struct cpumask *new_mask)
1504{ 1500{
@@ -1670,8 +1666,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1670 if (!p->signal) 1666 if (!p->signal)
1671 return; 1667 return;
1672 1668
1673 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; 1669 /* max may change after cur was read, this will be fixed next tick */
1674 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; 1670 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME);
1675 1672
1676 if (soft != RLIM_INFINITY) { 1673 if (soft != RLIM_INFINITY) {
1677 unsigned long next; 1674 unsigned long next;
@@ -1721,7 +1718,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1718 dequeue_pushable_task(rq, p);
1722} 1719}
1723 1720
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1721static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1722{
1726 /* 1723 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1724 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1743,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1743#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1744 .select_task_rq = select_task_rq_rt,
1748 1745
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1746 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1747 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1748 .rq_offline = rq_offline_rt,
diff --git a/kernel/signal.c b/kernel/signal.c
index 934ae5e687b9..906ae5a1779c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -159,6 +159,10 @@ void recalc_sigpending(void)
159 159
160/* Given the mask, find the first available signal that should be serviced. */ 160/* Given the mask, find the first available signal that should be serviced. */
161 161
162#define SYNCHRONOUS_MASK \
163 (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
164 sigmask(SIGTRAP) | sigmask(SIGFPE))
165
162int next_signal(struct sigpending *pending, sigset_t *mask) 166int next_signal(struct sigpending *pending, sigset_t *mask)
163{ 167{
164 unsigned long i, *s, *m, x; 168 unsigned long i, *s, *m, x;
@@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
166 170
167 s = pending->signal.sig; 171 s = pending->signal.sig;
168 m = mask->sig; 172 m = mask->sig;
173
174 /*
175 * Handle the first word specially: it contains the
176 * synchronous signals that need to be dequeued first.
177 */
178 x = *s &~ *m;
179 if (x) {
180 if (x & SYNCHRONOUS_MASK)
181 x &= SYNCHRONOUS_MASK;
182 sig = ffz(~x) + 1;
183 return sig;
184 }
185
169 switch (_NSIG_WORDS) { 186 switch (_NSIG_WORDS) {
170 default: 187 default:
171 for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) 188 for (i = 1; i < _NSIG_WORDS; ++i) {
172 if ((x = *s &~ *m) != 0) { 189 x = *++s &~ *++m;
173 sig = ffz(~x) + i*_NSIG_BPW + 1; 190 if (!x)
174 break; 191 continue;
175 } 192 sig = ffz(~x) + i*_NSIG_BPW + 1;
193 break;
194 }
176 break; 195 break;
177 196
178 case 2: if ((x = s[0] &~ m[0]) != 0) 197 case 2:
179 sig = 1; 198 x = s[1] &~ m[1];
180 else if ((x = s[1] &~ m[1]) != 0) 199 if (!x)
181 sig = _NSIG_BPW + 1;
182 else
183 break; 200 break;
184 sig += ffz(~x); 201 sig = ffz(~x) + _NSIG_BPW + 1;
185 break; 202 break;
186 203
187 case 1: if ((x = *s &~ *m) != 0) 204 case 1:
188 sig = ffz(~x) + 1; 205 /* Nothing to do */
189 break; 206 break;
190 } 207 }
191 208
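An aside on the next_signal() change above: with the new SYNCHRONOUS_MASK, fault-generated signals in the first word are dequeued before anything else. A user-space sketch (signal numbers as on x86 Linux, ffz(~x) replaced by a GCC/Clang builtin) reproduces the selection order.

#include <stdio.h>

/* bit i set => signal (i + 1) pending */
#define SIGMASK(sig)	(1UL << ((sig) - 1))

/* synchronous (fault-generated) signals, as listed in the hunk above */
#define SIGILL	 4
#define SIGTRAP	 5
#define SIGBUS	 7
#define SIGFPE	 8
#define SIGSEGV	11

#define SYNCHRONOUS_MASK \
	(SIGMASK(SIGSEGV) | SIGMASK(SIGBUS) | SIGMASK(SIGILL) | \
	 SIGMASK(SIGTRAP) | SIGMASK(SIGFPE))

/* ffz(~x) in the kernel == index of the lowest set bit of x */
static int lowest_set_bit(unsigned long x)
{
	return __builtin_ctzl(x);
}

static int next_signal(unsigned long pending, unsigned long blocked)
{
	unsigned long x = pending & ~blocked;

	if (!x)
		return 0;
	if (x & SYNCHRONOUS_MASK)	/* deliver faults before anything else */
		x &= SYNCHRONOUS_MASK;
	return lowest_set_bit(x) + 1;
}

int main(void)
{
	unsigned long pending = SIGMASK(2 /* SIGINT */) | SIGMASK(SIGSEGV);

	printf("picked signal %d\n", next_signal(pending, 0));		/* 11: SIGSEGV wins */
	printf("picked signal %d\n", next_signal(SIGMASK(2), 0));	/* 2: SIGINT */
	return 0;
}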
@@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
228 245
229 if (override_rlimit || 246 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 247 atomic_read(&user->sigpending) <=
231 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { 248 task_rlimit(t, RLIMIT_SIGPENDING)) {
232 q = kmem_cache_alloc(sigqueue_cachep, flags); 249 q = kmem_cache_alloc(sigqueue_cachep, flags);
233 } else { 250 } else {
234 print_dropped_signal(sig); 251 print_dropped_signal(sig);
@@ -625,7 +642,7 @@ static inline bool si_fromuser(const struct siginfo *info)
625static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
626 struct task_struct *t) 643 struct task_struct *t)
627{ 644{
628 const struct cred *cred = current_cred(), *tcred; 645 const struct cred *cred, *tcred;
629 struct pid *sid; 646 struct pid *sid;
630 int error; 647 int error;
631 648
@@ -639,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
639 if (error) 656 if (error)
640 return error; 657 return error;
641 658
659 cred = current_cred();
642 tcred = __task_cred(t); 660 tcred = __task_cred(t);
643 if ((cred->euid ^ tcred->suid) && 661 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) &&
644 (cred->euid ^ tcred->uid) && 663 (cred->euid ^ tcred->uid) &&
645 (cred->uid ^ tcred->suid) && 664 (cred->uid ^ tcred->suid) &&
646 (cred->uid ^ tcred->uid) && 665 (cred->uid ^ tcred->uid) &&
@@ -1066,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1066/* 1085/*
1067 * Nuke all other threads in the group. 1086 * Nuke all other threads in the group.
1068 */ 1087 */
1069void zap_other_threads(struct task_struct *p) 1088int zap_other_threads(struct task_struct *p)
1070{ 1089{
1071 struct task_struct *t; 1090 struct task_struct *t = p;
1091 int count = 0;
1072 1092
1073 p->signal->group_stop_count = 0; 1093 p->signal->group_stop_count = 0;
1074 1094
1075 for (t = next_thread(p); t != p; t = next_thread(t)) { 1095 while_each_thread(p, t) {
1076 /* 1096 count++;
1077 * Don't bother with already dead threads 1097
1078 */ 1098 /* Don't bother with already dead threads */
1079 if (t->exit_state) 1099 if (t->exit_state)
1080 continue; 1100 continue;
1081
1082 /* SIGKILL will be handled before any pending SIGSTOP */
1083 sigaddset(&t->pending.signal, SIGKILL); 1101 sigaddset(&t->pending.signal, SIGKILL);
1084 signal_wake_up(t, 1); 1102 signal_wake_up(t, 1);
1085 } 1103 }
1104
1105 return count;
1086} 1106}
1087 1107
1088struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -2718,3 +2738,43 @@ void __init signals_init(void)
2718{ 2738{
2719 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 2739 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2720} 2740}
2741
2742#ifdef CONFIG_KGDB_KDB
2743#include <linux/kdb.h>
2744/*
2745 * kdb_send_sig_info - Allows kdb to send signals without exposing
2746 * signal internals. This function checks if the required locks are
2747 * available before calling the main signal code, to avoid kdb
2748 * deadlocks.
2749 */
2750void
2751kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
2752{
2753 static struct task_struct *kdb_prev_t;
2754 int sig, new_t;
2755 if (!spin_trylock(&t->sighand->siglock)) {
2756 kdb_printf("Can't do kill command now.\n"
2757 "The sigmask lock is held somewhere else in "
2758 "kernel, try again later\n");
2759 return;
2760 }
2761 spin_unlock(&t->sighand->siglock);
2762 new_t = kdb_prev_t != t;
2763 kdb_prev_t = t;
2764 if (t->state != TASK_RUNNING && new_t) {
2765 kdb_printf("Process is not RUNNING, sending a signal from "
2766 "kdb risks deadlock\n"
2767 "on the run queue locks. "
2768 "The signal has _not_ been sent.\n"
2769 "Reissue the kill command if you want to risk "
2770 "the deadlock.\n");
2771 return;
2772 }
2773 sig = info->si_signo;
2774 if (send_sig_info(sig, info, t))
2775 kdb_printf("Fail to deliver Signal %d to process %d.\n",
2776 sig, t->pid);
2777 else
2778 kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
2779}
2780#endif /* CONFIG_KGDB_KDB */
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,11 +9,10 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 16static struct {
18 struct list_head queue; 17 struct list_head queue;
19 raw_spinlock_t lock; 18 raw_spinlock_t lock;
@@ -33,12 +32,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 32 cpumask_var_t cpumask;
34}; 33};
35 34
35static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
36
36struct call_single_queue { 37struct call_single_queue {
37 struct list_head list; 38 struct list_head list;
38 raw_spinlock_t lock; 39 raw_spinlock_t lock;
39}; 40};
40 41
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 42static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 43
43static int 44static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 45hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -51,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51 case CPU_UP_PREPARE_FROZEN: 52 case CPU_UP_PREPARE_FROZEN:
52 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
53 cpu_to_node(cpu))) 54 cpu_to_node(cpu)))
54 return NOTIFY_BAD; 55 return notifier_from_errno(-ENOMEM);
55 break; 56 break;
56 57
57#ifdef CONFIG_HOTPLUG_CPU 58#ifdef CONFIG_HOTPLUG_CPU
@@ -256,7 +257,7 @@ void generic_smp_call_function_single_interrupt(void)
256 } 257 }
257} 258}
258 259
259static DEFINE_PER_CPU(struct call_single_data, csd_data); 260static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
260 261
261/* 262/*
262 * smp_call_function_single - Run a function on a specific CPU 263 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -721,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
722 cond_resched(); 717 cond_resched();
723 preempt_disable(); 718 preempt_disable();
724 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
725 } 720 }
726 preempt_enable(); 721 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
@@ -813,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
813 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return notifier_from_errno(PTR_ERR(p));
817 } 812 }
818 kthread_bind(p, hotcpu); 813 kthread_bind(p, hotcpu);
819 per_cpu(ksoftirqd, hotcpu) = p; 814 per_cpu(ksoftirqd, hotcpu) = p;
@@ -855,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
855 void *cpu = (void *)(long)smp_processor_id(); 850 void *cpu = (void *)(long)smp_processor_id();
856 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
857 852
858 BUG_ON(err == NOTIFY_BAD); 853 BUG_ON(err != NOTIFY_OK);
859 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
860 register_cpu_notifier(&cpu_nfb); 855 register_cpu_notifier(&cpu_nfb);
861 return 0; 856 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
@@ -140,11 +155,11 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
144 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
148 return; 163 return;
149 164
150 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
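The softlockup hunk above switches to wrap-safe comparisons; a user-space model of time_after()/time_before_eq() (simplified, not the kernel macros) shows why the signed-difference form survives counter wrap.

#include <stdio.h>

/* wrap-safe comparisons in the style of time_after()/time_before_eq() */
typedef unsigned long jiffies_t;

static int time_after(jiffies_t a, jiffies_t b)
{
	return (long)(b - a) < 0;
}

static int time_before_eq(jiffies_t a, jiffies_t b)
{
	return (long)(a - b) <= 0;
}

int main(void)
{
	jiffies_t now = (jiffies_t)-5;	/* just before the counter wraps */
	jiffies_t then = now + 10;	/* 10 ticks later, after the wrap */

	printf("time_after(then, now)     = %d\n", time_after(then, now));	/* 1 */
	printf("time_before_eq(now, then) = %d\n", time_before_eq(now, then));	/* 1 */
	return 0;
}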
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,10 +30,33 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->completed = 0;
39 mutex_init(&sp->mutex);
40 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
41 return sp->per_cpu_ref ? 0 : -ENOMEM;
42}
43
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45
46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key)
48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp);
55}
56EXPORT_SYMBOL_GPL(__init_srcu_struct);
57
58#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
59
37/** 60/**
38 * init_srcu_struct - initialize a sleep-RCU structure 61 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 62 * @sp: structure to initialize.
@@ -44,13 +67,12 @@
44 */ 67 */
45int init_srcu_struct(struct srcu_struct *sp) 68int init_srcu_struct(struct srcu_struct *sp)
46{ 69{
47 sp->completed = 0; 70 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 71}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 72EXPORT_SYMBOL_GPL(init_srcu_struct);
53 73
74#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
75
54/* 76/*
55 * srcu_readers_active_idx -- returns approximate number of readers 77 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 78 * active on the specified rank of per-CPU counters.
@@ -100,15 +122,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 122}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 123EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 124
103/** 125/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 126 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 127 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 128 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 129 */
111int srcu_read_lock(struct srcu_struct *sp) 130int __srcu_read_lock(struct srcu_struct *sp)
112{ 131{
113 int idx; 132 int idx;
114 133
@@ -120,31 +139,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 139 preempt_enable();
121 return idx; 140 return idx;
122} 141}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 142EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 143
125/** 144/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 145 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 146 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 147 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 148 * Must be called from process context.
134 */ 149 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 150void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 151{
137 preempt_disable(); 152 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 153 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 154 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 155 preempt_enable();
141} 156}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 157EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 158
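A sketch of the read side served by __srcu_read_lock()/__srcu_read_unlock(); the srcu_read_lock()/srcu_read_unlock() wrappers that add the lockdep acquire/release annotations live in the header (not shown here), and my_data/my_ptr/read_value are illustrative names:

#include <linux/srcu.h>

struct my_data {
	int value;
};

static struct my_data *my_ptr;	/* published elsewhere with rcu_assign_pointer() */

static int read_value(struct srcu_struct *sp)
{
	struct my_data *p;
	int idx, val = -1;

	idx = srcu_read_lock(sp);	/* ends up in __srcu_read_lock() */
	p = srcu_dereference(my_ptr, sp);
	if (p)
		val = p->value;
	srcu_read_unlock(sp, idx);	/* ends up in __srcu_read_unlock() */
	return val;
}

The read-side critical section may block, which is the point of SRCU, but the index returned by srcu_read_lock() must be passed unchanged to the matching srcu_read_unlock().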
144/* 159/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 160 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 161 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 162static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 163{
149 int idx; 164 int idx;
150 165
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 912823e2a11b..70f8d90331e9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
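As a usage illustration for the interface documented above, a minimal sketch with hypothetical names (drain_local_cache, drain_cpu2) and an arbitrary CPU number:

#include <linux/kernel.h>
#include <linux/stop_machine.h>

/* Runs on the stopper thread of the target CPU, at the highest priority
 * and with preemption disabled around the call, so it must not sleep. */
static int drain_local_cache(void *arg)
{
	unsigned long *counter = arg;

	(*counter)++;
	return 0;
}

static int drain_cpu2(void)
{
	unsigned long drained = 0;
	int ret;

	ret = stop_one_cpu(2, drain_local_cache, &drained);
	if (ret == -ENOENT)
		printk(KERN_WARNING "cpu 2 went offline before the callback ran\n");
	return ret;
}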
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
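The same illustration for the fire-and-forget variant; kick_work and kick_cpu are hypothetical, and the per-CPU buffer exists precisely so the caller can guarantee @work_buf stays untouched until the stopper picks it up:

#include <linux/percpu.h>
#include <linux/stop_machine.h>

static DEFINE_PER_CPU(struct cpu_stop_work, kick_work);

static void kick_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	/* Returns immediately; completion and the return value of @fn
	 * are not reported back to the caller. */
	stop_one_cpu_nowait(cpu, fn, arg, &per_cpu(kick_work, cpu));
}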
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
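A hedged sketch of the multi-CPU case; sync_counters() and sync_all_cpus() are invented names:

#include <linux/cpumask.h>
#include <linux/stop_machine.h>

/* Runs concurrently on every targeted online CPU; must not sleep. */
static int sync_counters(void *arg)
{
	return 0;
}

static int sync_all_cpus(void *arg)
{
	/*
	 * -ENOENT only if every CPU in the mask was offline; otherwise 0,
	 * or a non-zero value if any invocation of sync_counters() failed.
	 */
	return stop_cpus(cpu_online_mask, sync_counters, arg);
}

Because all stop_cpus() calls are serialized, sync_counters() could in principle wait for its peers on the other CPUs without risking a deadlock against another stop_cpus() user.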
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
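And a small sketch of the non-blocking variant, reusing the hypothetical sync_counters() above; the only new behaviour is the -EAGAIN path:

#include <linux/delay.h>
#include <linux/stop_machine.h>

static int try_sync_all_cpus(void *arg)
{
	int ret;

	/* Back off while another stop_cpus() user holds the facility. */
	while ((ret = try_stop_cpus(cpu_online_mask, sync_counters, arg)) == -EAGAIN)
		msleep(1);

	return ret;
}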
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_POST_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
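For completeness, a sketch of how a caller would use the reworked stop_machine(); apply_patch() and do_patch() are hypothetical:

#include <linux/stop_machine.h>

/* Runs with interrupts hard-disabled while every other online CPU spins
 * in stop_machine_cpu_stop(); must not sleep or take sleeping locks. */
static int apply_patch(void *arg)
{
	return 0;
}

static int do_patch(void *arg)
{
	/*
	 * With a NULL cpumask only the first online CPU executes
	 * apply_patch(); the remaining CPUs just hold still until the
	 * state machine reaches STOPMACHINE_EXIT.
	 */
	return stop_machine(apply_patch, arg, NULL);
}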
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,8 +33,10 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/personality.h>
36#include <linux/ptrace.h> 37#include <linux/ptrace.h>
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
38 40
39#include <linux/compat.h> 41#include <linux/compat.h>
40#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -222,6 +224,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 224 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 225 return -EINVAL;
224 226
227 rcu_read_lock();
225 read_lock(&tasklist_lock); 228 read_lock(&tasklist_lock);
226 switch (which) { 229 switch (which) {
227 case PRIO_PROCESS: 230 case PRIO_PROCESS:
@@ -267,6 +270,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 270 }
268out_unlock: 271out_unlock:
269 read_unlock(&tasklist_lock); 272 read_unlock(&tasklist_lock);
273 rcu_read_unlock();
270 274
271 return retval; 275 return retval;
272} 276}
@@ -488,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
488 return -ENOMEM; 492 return -ENOMEM;
489 old = current_cred(); 493 old = current_cred();
490 494
491 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
492 if (retval)
493 goto error;
494
495 retval = -EPERM; 495 retval = -EPERM;
496 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
497 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -539,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
539 return -ENOMEM; 539 return -ENOMEM;
540 old = current_cred(); 540 old = current_cred();
541 541
542 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
543 if (retval)
544 goto error;
545
546 retval = -EPERM; 542 retval = -EPERM;
547 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
548 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -569,13 +565,7 @@ static int set_user(struct cred *new)
569 if (!new_user) 565 if (!new_user)
570 return -EAGAIN; 566 return -EAGAIN;
571 567
572 if (!task_can_switch_user(new_user, current)) { 568 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
573 free_uid(new_user);
574 return -EINVAL;
575 }
576
577 if (atomic_read(&new_user->processes) >=
578 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
579 new_user != INIT_USER) { 569 new_user != INIT_USER) {
580 free_uid(new_user); 570 free_uid(new_user);
581 return -EAGAIN; 571 return -EAGAIN;
@@ -612,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
612 return -ENOMEM; 602 return -ENOMEM;
613 old = current_cred(); 603 old = current_cred();
614 604
615 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
616 if (retval)
617 goto error;
618
619 retval = -EPERM; 605 retval = -EPERM;
620 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
621 new->uid = ruid; 607 new->uid = ruid;
@@ -677,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
677 return -ENOMEM; 663 return -ENOMEM;
678 old = current_cred(); 664 old = current_cred();
679 665
680 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
681 if (retval)
682 goto error;
683
684 retval = -EPERM; 666 retval = -EPERM;
685 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
686 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -721,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
721 if (!new) 703 if (!new)
722 return -ENOMEM; 704 return -ENOMEM;
723 705
724 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
725 if (retval)
726 goto error;
727 old = current_cred(); 706 old = current_cred();
728 707
729 retval = -EPERM; 708 retval = -EPERM;
@@ -790,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
790 return -ENOMEM; 769 return -ENOMEM;
791 old = current_cred(); 770 old = current_cred();
792 771
793 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
794 if (retval)
795 goto error;
796
797 retval = -EPERM; 772 retval = -EPERM;
798 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
799 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -853,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
853 old = current_cred(); 828 old = current_cred();
854 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
855 830
856 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
857 goto error;
858
859 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
860 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
861 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -866,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
866 } 838 }
867 } 839 }
868 840
869error:
870 abort_creds(new); 841 abort_creds(new);
871 return old_fsuid; 842 return old_fsuid;
872 843
@@ -890,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
890 old = current_cred(); 861 old = current_cred();
891 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
892 863
893 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
894 goto error;
895
896 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
897 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
898 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -902,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
902 } 870 }
903 } 871 }
904 872
905error:
906 abort_creds(new); 873 abort_creds(new);
907 return old_fsgid; 874 return old_fsgid;
908 875
@@ -1118,6 +1085,15 @@ out:
1118 1085
1119DECLARE_RWSEM(uts_sem); 1086DECLARE_RWSEM(uts_sem);
1120 1087
1088#ifdef COMPAT_UTS_MACHINE
1089#define override_architecture(name) \
1090 (personality(current->personality) == PER_LINUX32 && \
1091 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1092 sizeof(COMPAT_UTS_MACHINE)))
1093#else
1094#define override_architecture(name) 0
1095#endif
1096
1121SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1097SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1122{ 1098{
1123 int errno = 0; 1099 int errno = 0;
@@ -1126,9 +1102,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1126 if (copy_to_user(name, utsname(), sizeof *name)) 1102 if (copy_to_user(name, utsname(), sizeof *name))
1127 errno = -EFAULT; 1103 errno = -EFAULT;
1128 up_read(&uts_sem); 1104 up_read(&uts_sem);
1105
1106 if (!errno && override_architecture(name))
1107 errno = -EFAULT;
1129 return errno; 1108 return errno;
1130} 1109}
1131 1110
1111#ifdef __ARCH_WANT_SYS_OLD_UNAME
1112/*
1113 * Old cruft
1114 */
1115SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1116{
1117 int error = 0;
1118
1119 if (!name)
1120 return -EFAULT;
1121
1122 down_read(&uts_sem);
1123 if (copy_to_user(name, utsname(), sizeof(*name)))
1124 error = -EFAULT;
1125 up_read(&uts_sem);
1126
1127 if (!error && override_architecture(name))
1128 error = -EFAULT;
1129 return error;
1130}
1131
1132SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1133{
1134 int error;
1135
1136 if (!name)
1137 return -EFAULT;
1138 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
1139 return -EFAULT;
1140
1141 down_read(&uts_sem);
1142 error = __copy_to_user(&name->sysname, &utsname()->sysname,
1143 __OLD_UTS_LEN);
1144 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
1145 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
1146 __OLD_UTS_LEN);
1147 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
1148 error |= __copy_to_user(&name->release, &utsname()->release,
1149 __OLD_UTS_LEN);
1150 error |= __put_user(0, name->release + __OLD_UTS_LEN);
1151 error |= __copy_to_user(&name->version, &utsname()->version,
1152 __OLD_UTS_LEN);
1153 error |= __put_user(0, name->version + __OLD_UTS_LEN);
1154 error |= __copy_to_user(&name->machine, &utsname()->machine,
1155 __OLD_UTS_LEN);
1156 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
1157 up_read(&uts_sem);
1158
1159 if (!error && override_architecture(name))
1160 error = -EFAULT;
1161 return error ? -EFAULT : 0;
1162}
1163#endif
1164
1132SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) 1165SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1133{ 1166{
1134 int errno; 1167 int errno;
@@ -1599,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1599 1632
1600char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 1633char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
1601 1634
1602static void argv_cleanup(char **argv, char **envp) 1635static void argv_cleanup(struct subprocess_info *info)
1603{ 1636{
1604 argv_free(argv); 1637 argv_free(info->argv);
1605} 1638}
1606 1639
1607/** 1640/**
@@ -1635,7 +1668,7 @@ int orderly_poweroff(bool force)
1635 goto out; 1668 goto out;
1636 } 1669 }
1637 1670
1638 call_usermodehelper_setcleanup(info, argv_cleanup); 1671 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
1639 1672
1640 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 1673 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
1641 1674
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
126cond_syscall(sys_setuid16); 126cond_syscall(sys_setuid16);
127cond_syscall(sys_vm86old); 127cond_syscall(sys_vm86old);
128cond_syscall(sys_vm86); 128cond_syscall(sys_vm86);
129cond_syscall(sys_ipc);
129cond_syscall(compat_sys_ipc); 130cond_syscall(compat_sys_ipc);
130cond_syscall(compat_sys_sysctl); 131cond_syscall(compat_sys_sysctl);
131cond_syscall(sys_flock); 132cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8a68b2448468..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/signal.h>
26#include <linux/proc_fs.h> 27#include <linux/proc_fs.h>
27#include <linux/security.h> 28#include <linux/security.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
@@ -36,6 +37,7 @@
36#include <linux/highuid.h> 37#include <linux/highuid.h>
37#include <linux/writeback.h> 38#include <linux/writeback.h>
38#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
39#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
40#include <linux/initrd.h> 42#include <linux/initrd.h>
41#include <linux/key.h> 43#include <linux/key.h>
@@ -50,6 +52,8 @@
50#include <linux/ftrace.h> 52#include <linux/ftrace.h>
51#include <linux/slow-work.h> 53#include <linux/slow-work.h>
52#include <linux/perf_event.h> 54#include <linux/perf_event.h>
55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h>
53 57
54#include <asm/uaccess.h> 58#include <asm/uaccess.h>
55#include <asm/processor.h> 59#include <asm/processor.h>
@@ -59,13 +63,23 @@
59#include <asm/stacktrace.h> 63#include <asm/stacktrace.h>
60#include <asm/io.h> 64#include <asm/io.h>
61#endif 65#endif
66#ifdef CONFIG_BSD_PROCESS_ACCT
67#include <linux/acct.h>
68#endif
69#ifdef CONFIG_RT_MUTEXES
70#include <linux/rtmutex.h>
71#endif
72#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
73#include <linux/lockdep.h>
74#endif
75#ifdef CONFIG_CHR_DEV_SG
76#include <scsi/sg.h>
77#endif
62 78
63 79
64#if defined(CONFIG_SYSCTL) 80#if defined(CONFIG_SYSCTL)
65 81
66/* External variables not in a header file. */ 82/* External variables not in a header file. */
67extern int C_A_D;
68extern int print_fatal_signals;
69extern int sysctl_overcommit_memory; 83extern int sysctl_overcommit_memory;
70extern int sysctl_overcommit_ratio; 84extern int sysctl_overcommit_ratio;
71extern int sysctl_panic_on_oom; 85extern int sysctl_panic_on_oom;
@@ -87,9 +101,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
87#ifndef CONFIG_MMU 101#ifndef CONFIG_MMU
88extern int sysctl_nr_trim_pages; 102extern int sysctl_nr_trim_pages;
89#endif 103#endif
90#ifdef CONFIG_RCU_TORTURE_TEST
91extern int rcutorture_runnable;
92#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
93#ifdef CONFIG_BLOCK 104#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 105extern int blk_iopoll_enabled;
95#endif 106#endif
@@ -119,14 +130,6 @@ static int min_percpu_pagelist_fract = 8;
119 130
120static int ngroups_max = NGROUPS_MAX; 131static int ngroups_max = NGROUPS_MAX;
121 132
122#ifdef CONFIG_MODULES
123extern char modprobe_path[];
124extern int modules_disabled;
125#endif
126#ifdef CONFIG_CHR_DEV_SG
127extern int sg_big_buff;
128#endif
129
130#ifdef CONFIG_SPARC 133#ifdef CONFIG_SPARC
131#include <asm/system.h> 134#include <asm/system.h>
132#endif 135#endif
@@ -148,10 +151,6 @@ extern int sysctl_userprocess_debug;
148extern int spin_retry; 151extern int spin_retry;
149#endif 152#endif
150 153
151#ifdef CONFIG_BSD_PROCESS_ACCT
152extern int acct_parm[];
153#endif
154
155#ifdef CONFIG_IA64 154#ifdef CONFIG_IA64
156extern int no_unaligned_warning; 155extern int no_unaligned_warning;
157extern int unaligned_dump_stack; 156extern int unaligned_dump_stack;
@@ -159,10 +158,6 @@ extern int unaligned_dump_stack;
159 158
160extern struct ratelimit_state printk_ratelimit_state; 159extern struct ratelimit_state printk_ratelimit_state;
161 160
162#ifdef CONFIG_RT_MUTEXES
163extern int max_lock_depth;
164#endif
165
166#ifdef CONFIG_PROC_SYSCTL 161#ifdef CONFIG_PROC_SYSCTL
167static int proc_do_cad_pid(struct ctl_table *table, int write, 162static int proc_do_cad_pid(struct ctl_table *table, int write,
168 void __user *buffer, size_t *lenp, loff_t *ppos); 163 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -170,6 +165,27 @@ static int proc_taint(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 165 void __user *buffer, size_t *lenp, loff_t *ppos);
171#endif 166#endif
172 167
168#ifdef CONFIG_MAGIC_SYSRQ
 169static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */
170
171static int sysrq_sysctl_handler(ctl_table *table, int write,
172 void __user *buffer, size_t *lenp,
173 loff_t *ppos)
174{
175 int error;
176
177 error = proc_dointvec(table, write, buffer, lenp, ppos);
178 if (error)
179 return error;
180
181 if (write)
182 sysrq_toggle_support(__sysrq_enabled);
183
184 return 0;
185}
186
187#endif
188
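The sysrq handler above follows a reusable pattern: let proc_dointvec() parse the value, then push it into the subsystem only on writes. A hedged sketch of the same shape for an invented knob (my_knob and my_subsystem_update() are hypothetical):

static int my_knob;

static int my_knob_sysctl_handler(ctl_table *table, int write,
				  void __user *buffer, size_t *lenp,
				  loff_t *ppos)
{
	int error;

	error = proc_dointvec(table, write, buffer, lenp, ppos);
	if (error)
		return error;

	if (write)
		my_subsystem_update(my_knob);	/* hypothetical notification hook */

	return 0;
}

The corresponding table entry would point .data at my_knob and .proc_handler at my_knob_sysctl_handler, exactly as the sysrq entry is rewired later in this patch.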
173static struct ctl_table root_table[]; 189static struct ctl_table root_table[];
174static struct ctl_table_root sysctl_table_root; 190static struct ctl_table_root sysctl_table_root;
175static struct ctl_table_header root_table_header = { 191static struct ctl_table_header root_table_header = {
@@ -201,9 +217,6 @@ extern struct ctl_table epoll_table[];
201int sysctl_legacy_va_layout; 217int sysctl_legacy_va_layout;
202#endif 218#endif
203 219
204extern int prove_locking;
205extern int lock_stat;
206
207/* The default sysctl tables: */ 220/* The default sysctl tables: */
208 221
209static struct ctl_table root_table[] = { 222static struct ctl_table root_table[] = {
@@ -250,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
251#endif 264#endif
252 265
266#ifdef CONFIG_COMPACTION
267static int min_extfrag_threshold;
268static int max_extfrag_threshold = 1000;
269#endif
270
253static struct ctl_table kern_table[] = { 271static struct ctl_table kern_table[] = {
254 { 272 {
255 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
@@ -577,7 +595,7 @@ static struct ctl_table kern_table[] = {
577 .data = &__sysrq_enabled, 595 .data = &__sysrq_enabled,
578 .maxlen = sizeof (int), 596 .maxlen = sizeof (int),
579 .mode = 0644, 597 .mode = 0644,
580 .proc_handler = proc_dointvec, 598 .proc_handler = sysrq_sysctl_handler,
581 }, 599 },
582#endif 600#endif
583#ifdef CONFIG_PROC_SYSCTL 601#ifdef CONFIG_PROC_SYSCTL
@@ -631,7 +649,7 @@ static struct ctl_table kern_table[] = {
631#endif 649#endif
632 { 650 {
633 .procname = "userprocess_debug", 651 .procname = "userprocess_debug",
634 .data = &sysctl_userprocess_debug, 652 .data = &show_unhandled_signals,
635 .maxlen = sizeof(int), 653 .maxlen = sizeof(int),
636 .mode = 0644, 654 .mode = 0644,
637 .proc_handler = proc_dointvec, 655 .proc_handler = proc_dointvec,
@@ -1109,6 +1127,25 @@ static struct ctl_table vm_table[] = {
1109 .mode = 0644, 1127 .mode = 0644,
1110 .proc_handler = drop_caches_sysctl_handler, 1128 .proc_handler = drop_caches_sysctl_handler,
1111 }, 1129 },
1130#ifdef CONFIG_COMPACTION
1131 {
1132 .procname = "compact_memory",
1133 .data = &sysctl_compact_memory,
1134 .maxlen = sizeof(int),
1135 .mode = 0200,
1136 .proc_handler = sysctl_compaction_handler,
1137 },
1138 {
1139 .procname = "extfrag_threshold",
1140 .data = &sysctl_extfrag_threshold,
1141 .maxlen = sizeof(int),
1142 .mode = 0644,
1143 .proc_handler = sysctl_extfrag_handler,
1144 .extra1 = &min_extfrag_threshold,
1145 .extra2 = &max_extfrag_threshold,
1146 },
1147
1148#endif /* CONFIG_COMPACTION */
1112 { 1149 {
1113 .procname = "min_free_kbytes", 1150 .procname = "min_free_kbytes",
1114 .data = &min_free_kbytes, 1151 .data = &min_free_kbytes,
@@ -1433,6 +1470,14 @@ static struct ctl_table fs_table[] = {
1433 .child = binfmt_misc_table, 1470 .child = binfmt_misc_table,
1434 }, 1471 },
1435#endif 1472#endif
1473 {
1474 .procname = "pipe-max-size",
1475 .data = &pipe_max_size,
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &pipe_min_size,
1480 },
1436/* 1481/*
1437 * NOTE: do not add new entries to this table unless you have read 1482 * NOTE: do not add new entries to this table unless you have read
1438 * Documentation/sysctl/ctl_unnumbered.txt 1483 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1441,7 +1486,8 @@ static struct ctl_table fs_table[] = {
1441}; 1486};
1442 1487
1443static struct ctl_table debug_table[] = { 1488static struct ctl_table debug_table[] = {
1444#if defined(CONFIG_X86) || defined(CONFIG_PPC) 1489#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1490 defined(CONFIG_S390)
1445 { 1491 {
1446 .procname = "exception-trace", 1492 .procname = "exception-trace",
1447 .data = &show_unhandled_signals, 1493 .data = &show_unhandled_signals,
@@ -1450,6 +1496,17 @@ static struct ctl_table debug_table[] = {
1450 .proc_handler = proc_dointvec 1496 .proc_handler = proc_dointvec
1451 }, 1497 },
1452#endif 1498#endif
1499#if defined(CONFIG_OPTPROBES)
1500 {
1501 .procname = "kprobes-optimization",
1502 .data = &sysctl_kprobes_optimization,
1503 .maxlen = sizeof(int),
1504 .mode = 0644,
1505 .proc_handler = proc_kprobes_optimization_handler,
1506 .extra1 = &zero,
1507 .extra2 = &one,
1508 },
1509#endif
1453 { } 1510 { }
1454}; 1511};
1455 1512
@@ -2039,8 +2096,132 @@ int proc_dostring(struct ctl_table *table, int write,
2039 buffer, lenp, ppos); 2096 buffer, lenp, ppos);
2040} 2097}
2041 2098
2099static size_t proc_skip_spaces(char **buf)
2100{
2101 size_t ret;
2102 char *tmp = skip_spaces(*buf);
2103 ret = tmp - *buf;
2104 *buf = tmp;
2105 return ret;
2106}
2107
2108static void proc_skip_char(char **buf, size_t *size, const char v)
2109{
2110 while (*size) {
2111 if (**buf != v)
2112 break;
2113 (*size)--;
2114 (*buf)++;
2115 }
2116}
2117
2118#define TMPBUFLEN 22
2119/**
2120 * proc_get_long - reads an ASCII formatted integer from a user buffer
2121 *
2122 * @buf: a kernel buffer
2123 * @size: size of the kernel buffer
2124 * @val: this is where the number will be stored
2125 * @neg: set to %TRUE if number is negative
2126 * @perm_tr: a vector which contains the allowed trailers
2127 * @perm_tr_len: size of the perm_tr vector
2128 * @tr: pointer to store the trailer character
2129 *
2130 * In case of success %0 is returned and @buf and @size are updated with
2131 * the amount of bytes read. If @tr is non-NULL and a trailing
2132 * character exists (size is non-zero after returning from this
2133 * function), @tr is updated with the trailing character.
2134 */
2135static int proc_get_long(char **buf, size_t *size,
2136 unsigned long *val, bool *neg,
2137 const char *perm_tr, unsigned perm_tr_len, char *tr)
2138{
2139 int len;
2140 char *p, tmp[TMPBUFLEN];
2141
2142 if (!*size)
2143 return -EINVAL;
2144
2145 len = *size;
2146 if (len > TMPBUFLEN - 1)
2147 len = TMPBUFLEN - 1;
2148
2149 memcpy(tmp, *buf, len);
2150
2151 tmp[len] = 0;
2152 p = tmp;
2153 if (*p == '-' && *size > 1) {
2154 *neg = true;
2155 p++;
2156 } else
2157 *neg = false;
2158 if (!isdigit(*p))
2159 return -EINVAL;
2160
2161 *val = simple_strtoul(p, &p, 0);
2162
2163 len = p - tmp;
2164
 2165 /* We don't know if the next char is whitespace, thus we may accept
 2166 * invalid integers (e.g. 1234...a) or two integers instead of one
 2167 * (e.g. 123...1). So let's not allow such large numbers. */
2168 if (len == TMPBUFLEN - 1)
2169 return -EINVAL;
2170
2171 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2172 return -EINVAL;
2173
2174 if (tr && (len < *size))
2175 *tr = *p;
2176
2177 *buf += len;
2178 *size -= len;
2179
2180 return 0;
2181}
2182
2183/**
2184 * proc_put_long - converts an integer to a decimal ASCII formatted string
2185 *
2186 * @buf: the user buffer
2187 * @size: the size of the user buffer
2188 * @val: the integer to be converted
2189 * @neg: sign of the number, %TRUE for negative
2190 *
2191 * In case of success %0 is returned and @buf and @size are updated with
2192 * the amount of bytes written.
2193 */
2194static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2195 bool neg)
2196{
2197 int len;
2198 char tmp[TMPBUFLEN], *p = tmp;
2199
2200 sprintf(p, "%s%lu", neg ? "-" : "", val);
2201 len = strlen(tmp);
2202 if (len > *size)
2203 len = *size;
2204 if (copy_to_user(*buf, tmp, len))
2205 return -EFAULT;
2206 *size -= len;
2207 *buf += len;
2208 return 0;
2209}
2210#undef TMPBUFLEN
2211
2212static int proc_put_char(void __user **buf, size_t *size, char c)
2213{
2214 if (*size) {
2215 char __user **buffer = (char __user **)buf;
2216 if (put_user(c, *buffer))
2217 return -EFAULT;
2218 (*size)--, (*buffer)++;
2219 *buf = *buffer;
2220 }
2221 return 0;
2222}
2042 2223
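These helpers are static to kernel/sysctl.c, so the following is illustrative only; it mirrors the write-path shape used by the rewritten __do_proc_dointvec() below (parse_one() is an invented name):

static int parse_one(char *kbuf, size_t left, int *valp)
{
	unsigned long lval;
	bool neg;
	int err;

	left -= proc_skip_spaces(&kbuf);
	if (!left)
		return -EINVAL;

	err = proc_get_long(&kbuf, &left, &lval, &neg,
			    proc_wspace_sep, sizeof(proc_wspace_sep), NULL);
	if (err)
		return err;

	*valp = neg ? -(int)lval : (int)lval;
	return 0;
}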
2043static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2224static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2044 int *valp, 2225 int *valp,
2045 int write, void *data) 2226 int write, void *data)
2046{ 2227{
@@ -2049,33 +2230,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2049 } else { 2230 } else {
2050 int val = *valp; 2231 int val = *valp;
2051 if (val < 0) { 2232 if (val < 0) {
2052 *negp = -1; 2233 *negp = true;
2053 *lvalp = (unsigned long)-val; 2234 *lvalp = (unsigned long)-val;
2054 } else { 2235 } else {
2055 *negp = 0; 2236 *negp = false;
2056 *lvalp = (unsigned long)val; 2237 *lvalp = (unsigned long)val;
2057 } 2238 }
2058 } 2239 }
2059 return 0; 2240 return 0;
2060} 2241}
2061 2242
2243static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2244
2062static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2245static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2063 int write, void __user *buffer, 2246 int write, void __user *buffer,
2064 size_t *lenp, loff_t *ppos, 2247 size_t *lenp, loff_t *ppos,
2065 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2248 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2066 int write, void *data), 2249 int write, void *data),
2067 void *data) 2250 void *data)
2068{ 2251{
2069#define TMPBUFLEN 21 2252 int *i, vleft, first = 1, err = 0;
2070 int *i, vleft, first = 1, neg; 2253 unsigned long page = 0;
2071 unsigned long lval; 2254 size_t left;
2072 size_t left, len; 2255 char *kbuf;
2073
2074 char buf[TMPBUFLEN], *p;
2075 char __user *s = buffer;
2076 2256
2077 if (!tbl_data || !table->maxlen || !*lenp || 2257 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2078 (*ppos && !write)) {
2079 *lenp = 0; 2258 *lenp = 0;
2080 return 0; 2259 return 0;
2081 } 2260 }
@@ -2087,89 +2266,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2087 if (!conv) 2266 if (!conv)
2088 conv = do_proc_dointvec_conv; 2267 conv = do_proc_dointvec_conv;
2089 2268
2269 if (write) {
2270 if (left > PAGE_SIZE - 1)
2271 left = PAGE_SIZE - 1;
2272 page = __get_free_page(GFP_TEMPORARY);
2273 kbuf = (char *) page;
2274 if (!kbuf)
2275 return -ENOMEM;
2276 if (copy_from_user(kbuf, buffer, left)) {
2277 err = -EFAULT;
2278 goto free;
2279 }
2280 kbuf[left] = 0;
2281 }
2282
2090 for (; left && vleft--; i++, first=0) { 2283 for (; left && vleft--; i++, first=0) {
2284 unsigned long lval;
2285 bool neg;
2286
2091 if (write) { 2287 if (write) {
2092 while (left) { 2288 left -= proc_skip_spaces(&kbuf);
2093 char c; 2289
2094 if (get_user(c, s))
2095 return -EFAULT;
2096 if (!isspace(c))
2097 break;
2098 left--;
2099 s++;
2100 }
2101 if (!left) 2290 if (!left)
2102 break; 2291 break;
2103 neg = 0; 2292 err = proc_get_long(&kbuf, &left, &lval, &neg,
2104 len = left; 2293 proc_wspace_sep,
2105 if (len > sizeof(buf) - 1) 2294 sizeof(proc_wspace_sep), NULL);
2106 len = sizeof(buf) - 1; 2295 if (err)
2107 if (copy_from_user(buf, s, len))
2108 return -EFAULT;
2109 buf[len] = 0;
2110 p = buf;
2111 if (*p == '-' && left > 1) {
2112 neg = 1;
2113 p++;
2114 }
2115 if (*p < '0' || *p > '9')
2116 break;
2117
2118 lval = simple_strtoul(p, &p, 0);
2119
2120 len = p-buf;
2121 if ((len < left) && *p && !isspace(*p))
2122 break; 2296 break;
2123 s += len; 2297 if (conv(&neg, &lval, i, 1, data)) {
2124 left -= len; 2298 err = -EINVAL;
2125
2126 if (conv(&neg, &lval, i, 1, data))
2127 break; 2299 break;
2300 }
2128 } else { 2301 } else {
2129 p = buf; 2302 if (conv(&neg, &lval, i, 0, data)) {
2303 err = -EINVAL;
2304 break;
2305 }
2130 if (!first) 2306 if (!first)
2131 *p++ = '\t'; 2307 err = proc_put_char(&buffer, &left, '\t');
2132 2308 if (err)
2133 if (conv(&neg, &lval, i, 0, data)) 2309 break;
2310 err = proc_put_long(&buffer, &left, lval, neg);
2311 if (err)
2134 break; 2312 break;
2135
2136 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2137 len = strlen(buf);
2138 if (len > left)
2139 len = left;
2140 if(copy_to_user(s, buf, len))
2141 return -EFAULT;
2142 left -= len;
2143 s += len;
2144 } 2313 }
2145 } 2314 }
2146 2315
2147 if (!write && !first && left) { 2316 if (!write && !first && left && !err)
2148 if(put_user('\n', s)) 2317 err = proc_put_char(&buffer, &left, '\n');
2149 return -EFAULT; 2318 if (write && !err && left)
2150 left--, s++; 2319 left -= proc_skip_spaces(&kbuf);
2151 } 2320free:
2152 if (write) { 2321 if (write) {
2153 while (left) { 2322 free_page(page);
2154 char c; 2323 if (first)
2155 if (get_user(c, s++)) 2324 return err ? : -EINVAL;
2156 return -EFAULT;
2157 if (!isspace(c))
2158 break;
2159 left--;
2160 }
2161 } 2325 }
2162 if (write && first)
2163 return -EINVAL;
2164 *lenp -= left; 2326 *lenp -= left;
2165 *ppos += *lenp; 2327 *ppos += *lenp;
2166 return 0; 2328 return err;
2167#undef TMPBUFLEN
2168} 2329}
2169 2330
2170static int do_proc_dointvec(struct ctl_table *table, int write, 2331static int do_proc_dointvec(struct ctl_table *table, int write,
2171 void __user *buffer, size_t *lenp, loff_t *ppos, 2332 void __user *buffer, size_t *lenp, loff_t *ppos,
2172 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2333 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2173 int write, void *data), 2334 int write, void *data),
2174 void *data) 2335 void *data)
2175{ 2336{
@@ -2237,8 +2398,8 @@ struct do_proc_dointvec_minmax_conv_param {
2237 int *max; 2398 int *max;
2238}; 2399};
2239 2400
2240static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2401static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2241 int *valp, 2402 int *valp,
2242 int write, void *data) 2403 int write, void *data)
2243{ 2404{
2244 struct do_proc_dointvec_minmax_conv_param *param = data; 2405 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2251,10 +2412,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2251 } else { 2412 } else {
2252 int val = *valp; 2413 int val = *valp;
2253 if (val < 0) { 2414 if (val < 0) {
2254 *negp = -1; 2415 *negp = true;
2255 *lvalp = (unsigned long)-val; 2416 *lvalp = (unsigned long)-val;
2256 } else { 2417 } else {
2257 *negp = 0; 2418 *negp = false;
2258 *lvalp = (unsigned long)val; 2419 *lvalp = (unsigned long)val;
2259 } 2420 }
2260 } 2421 }
@@ -2294,102 +2455,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2294 unsigned long convmul, 2455 unsigned long convmul,
2295 unsigned long convdiv) 2456 unsigned long convdiv)
2296{ 2457{
2297#define TMPBUFLEN 21 2458 unsigned long *i, *min, *max;
2298 unsigned long *i, *min, *max, val; 2459 int vleft, first = 1, err = 0;
2299 int vleft, first=1, neg; 2460 unsigned long page = 0;
2300 size_t len, left; 2461 size_t left;
2301 char buf[TMPBUFLEN], *p; 2462 char *kbuf;
2302 char __user *s = buffer; 2463
2303 2464 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2304 if (!data || !table->maxlen || !*lenp ||
2305 (*ppos && !write)) {
2306 *lenp = 0; 2465 *lenp = 0;
2307 return 0; 2466 return 0;
2308 } 2467 }
2309 2468
2310 i = (unsigned long *) data; 2469 i = (unsigned long *) data;
2311 min = (unsigned long *) table->extra1; 2470 min = (unsigned long *) table->extra1;
2312 max = (unsigned long *) table->extra2; 2471 max = (unsigned long *) table->extra2;
2313 vleft = table->maxlen / sizeof(unsigned long); 2472 vleft = table->maxlen / sizeof(unsigned long);
2314 left = *lenp; 2473 left = *lenp;
2315 2474
2475 if (write) {
2476 if (left > PAGE_SIZE - 1)
2477 left = PAGE_SIZE - 1;
2478 page = __get_free_page(GFP_TEMPORARY);
2479 kbuf = (char *) page;
2480 if (!kbuf)
2481 return -ENOMEM;
2482 if (copy_from_user(kbuf, buffer, left)) {
2483 err = -EFAULT;
2484 goto free;
2485 }
2486 kbuf[left] = 0;
2487 }
2488
2316 for (; left && vleft--; i++, min++, max++, first=0) { 2489 for (; left && vleft--; i++, min++, max++, first=0) {
2490 unsigned long val;
2491
2317 if (write) { 2492 if (write) {
2318 while (left) { 2493 bool neg;
2319 char c; 2494
2320 if (get_user(c, s)) 2495 left -= proc_skip_spaces(&kbuf);
2321 return -EFAULT; 2496
2322 if (!isspace(c)) 2497 err = proc_get_long(&kbuf, &left, &val, &neg,
2323 break; 2498 proc_wspace_sep,
2324 left--; 2499 sizeof(proc_wspace_sep), NULL);
2325 s++; 2500 if (err)
2326 }
2327 if (!left)
2328 break;
2329 neg = 0;
2330 len = left;
2331 if (len > TMPBUFLEN-1)
2332 len = TMPBUFLEN-1;
2333 if (copy_from_user(buf, s, len))
2334 return -EFAULT;
2335 buf[len] = 0;
2336 p = buf;
2337 if (*p == '-' && left > 1) {
2338 neg = 1;
2339 p++;
2340 }
2341 if (*p < '0' || *p > '9')
2342 break;
2343 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2344 len = p-buf;
2345 if ((len < left) && *p && !isspace(*p))
2346 break; 2501 break;
2347 if (neg) 2502 if (neg)
2348 val = -val;
2349 s += len;
2350 left -= len;
2351
2352 if(neg)
2353 continue; 2503 continue;
2354 if ((min && val < *min) || (max && val > *max)) 2504 if ((min && val < *min) || (max && val > *max))
2355 continue; 2505 continue;
2356 *i = val; 2506 *i = val;
2357 } else { 2507 } else {
2358 p = buf; 2508 val = convdiv * (*i) / convmul;
2359 if (!first) 2509 if (!first)
2360 *p++ = '\t'; 2510 err = proc_put_char(&buffer, &left, '\t');
2361 sprintf(p, "%lu", convdiv * (*i) / convmul); 2511 err = proc_put_long(&buffer, &left, val, false);
2362 len = strlen(buf); 2512 if (err)
2363 if (len > left) 2513 break;
2364 len = left;
2365 if(copy_to_user(s, buf, len))
2366 return -EFAULT;
2367 left -= len;
2368 s += len;
2369 } 2514 }
2370 } 2515 }
2371 2516
2372 if (!write && !first && left) { 2517 if (!write && !first && left && !err)
2373 if(put_user('\n', s)) 2518 err = proc_put_char(&buffer, &left, '\n');
2374 return -EFAULT; 2519 if (write && !err)
2375 left--, s++; 2520 left -= proc_skip_spaces(&kbuf);
2376 } 2521free:
2377 if (write) { 2522 if (write) {
2378 while (left) { 2523 free_page(page);
2379 char c; 2524 if (first)
2380 if (get_user(c, s++)) 2525 return err ? : -EINVAL;
2381 return -EFAULT;
2382 if (!isspace(c))
2383 break;
2384 left--;
2385 }
2386 } 2526 }
2387 if (write && first)
2388 return -EINVAL;
2389 *lenp -= left; 2527 *lenp -= left;
2390 *ppos += *lenp; 2528 *ppos += *lenp;
2391 return 0; 2529 return err;
2392#undef TMPBUFLEN
2393} 2530}
2394 2531
2395static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2532static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2450,7 +2587,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2450} 2587}
2451 2588
2452 2589
2453static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2590static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2454 int *valp, 2591 int *valp,
2455 int write, void *data) 2592 int write, void *data)
2456{ 2593{
@@ -2462,10 +2599,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2462 int val = *valp; 2599 int val = *valp;
2463 unsigned long lval; 2600 unsigned long lval;
2464 if (val < 0) { 2601 if (val < 0) {
2465 *negp = -1; 2602 *negp = true;
2466 lval = (unsigned long)-val; 2603 lval = (unsigned long)-val;
2467 } else { 2604 } else {
2468 *negp = 0; 2605 *negp = false;
2469 lval = (unsigned long)val; 2606 lval = (unsigned long)val;
2470 } 2607 }
2471 *lvalp = lval / HZ; 2608 *lvalp = lval / HZ;
@@ -2473,7 +2610,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2473 return 0; 2610 return 0;
2474} 2611}
2475 2612
2476static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2613static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2477 int *valp, 2614 int *valp,
2478 int write, void *data) 2615 int write, void *data)
2479{ 2616{
@@ -2485,10 +2622,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2485 int val = *valp; 2622 int val = *valp;
2486 unsigned long lval; 2623 unsigned long lval;
2487 if (val < 0) { 2624 if (val < 0) {
2488 *negp = -1; 2625 *negp = true;
2489 lval = (unsigned long)-val; 2626 lval = (unsigned long)-val;
2490 } else { 2627 } else {
2491 *negp = 0; 2628 *negp = false;
2492 lval = (unsigned long)val; 2629 lval = (unsigned long)val;
2493 } 2630 }
2494 *lvalp = jiffies_to_clock_t(lval); 2631 *lvalp = jiffies_to_clock_t(lval);
@@ -2496,7 +2633,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2496 return 0; 2633 return 0;
2497} 2634}
2498 2635
2499static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2636static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2500 int *valp, 2637 int *valp,
2501 int write, void *data) 2638 int write, void *data)
2502{ 2639{
@@ -2506,10 +2643,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2506 int val = *valp; 2643 int val = *valp;
2507 unsigned long lval; 2644 unsigned long lval;
2508 if (val < 0) { 2645 if (val < 0) {
2509 *negp = -1; 2646 *negp = true;
2510 lval = (unsigned long)-val; 2647 lval = (unsigned long)-val;
2511 } else { 2648 } else {
2512 *negp = 0; 2649 *negp = false;
2513 lval = (unsigned long)val; 2650 lval = (unsigned long)val;
2514 } 2651 }
2515 *lvalp = jiffies_to_msecs(lval); 2652 *lvalp = jiffies_to_msecs(lval);
@@ -2606,6 +2743,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2606 return 0; 2743 return 0;
2607} 2744}
2608 2745
2746/**
2747 * proc_do_large_bitmap - read/write from/to a large bitmap
2748 * @table: the sysctl table
2749 * @write: %TRUE if this is a write to the sysctl file
2750 * @buffer: the user buffer
2751 * @lenp: the size of the user buffer
2752 * @ppos: file position
2753 *
2754 * The bitmap is stored at table->data and the bitmap length (in bits)
2755 * in table->maxlen.
2756 *
2757 * We use a comma-separated range format (e.g. 1,3-4,10-10) so that
2758 * large bitmaps may be represented in a compact manner. Writing into
2759 * the file will clear the bitmap then update it with the given input.
2760 *
2761 * Returns 0 on success.
2762 */
2763int proc_do_large_bitmap(struct ctl_table *table, int write,
2764 void __user *buffer, size_t *lenp, loff_t *ppos)
2765{
2766 int err = 0;
2767 bool first = 1;
2768 size_t left = *lenp;
2769 unsigned long bitmap_len = table->maxlen;
2770 unsigned long *bitmap = (unsigned long *) table->data;
2771 unsigned long *tmp_bitmap = NULL;
2772 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2773
2774 if (!bitmap_len || !left || (*ppos && !write)) {
2775 *lenp = 0;
2776 return 0;
2777 }
2778
2779 if (write) {
2780 unsigned long page = 0;
2781 char *kbuf;
2782
2783 if (left > PAGE_SIZE - 1)
2784 left = PAGE_SIZE - 1;
2785
2786 page = __get_free_page(GFP_TEMPORARY);
2787 kbuf = (char *) page;
2788 if (!kbuf)
2789 return -ENOMEM;
2790 if (copy_from_user(kbuf, buffer, left)) {
2791 free_page(page);
2792 return -EFAULT;
2793 }
2794 kbuf[left] = 0;
2795
2796 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2797 GFP_KERNEL);
2798 if (!tmp_bitmap) {
2799 free_page(page);
2800 return -ENOMEM;
2801 }
2802 proc_skip_char(&kbuf, &left, '\n');
2803 while (!err && left) {
2804 unsigned long val_a, val_b;
2805 bool neg;
2806
2807 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2808 sizeof(tr_a), &c);
2809 if (err)
2810 break;
2811 if (val_a >= bitmap_len || neg) {
2812 err = -EINVAL;
2813 break;
2814 }
2815
2816 val_b = val_a;
2817 if (left) {
2818 kbuf++;
2819 left--;
2820 }
2821
2822 if (c == '-') {
2823 err = proc_get_long(&kbuf, &left, &val_b,
2824 &neg, tr_b, sizeof(tr_b),
2825 &c);
2826 if (err)
2827 break;
2828 if (val_b >= bitmap_len || neg ||
2829 val_a > val_b) {
2830 err = -EINVAL;
2831 break;
2832 }
2833 if (left) {
2834 kbuf++;
2835 left--;
2836 }
2837 }
2838
2839 while (val_a <= val_b)
2840 set_bit(val_a++, tmp_bitmap);
2841
2842 first = 0;
2843 proc_skip_char(&kbuf, &left, '\n');
2844 }
2845 free_page(page);
2846 } else {
2847 unsigned long bit_a, bit_b = 0;
2848
2849 while (left) {
2850 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2851 if (bit_a >= bitmap_len)
2852 break;
2853 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2854 bit_a + 1) - 1;
2855
2856 if (!first) {
2857 err = proc_put_char(&buffer, &left, ',');
2858 if (err)
2859 break;
2860 }
2861 err = proc_put_long(&buffer, &left, bit_a, false);
2862 if (err)
2863 break;
2864 if (bit_a != bit_b) {
2865 err = proc_put_char(&buffer, &left, '-');
2866 if (err)
2867 break;
2868 err = proc_put_long(&buffer, &left, bit_b, false);
2869 if (err)
2870 break;
2871 }
2872
2873 first = 0; bit_b++;
2874 }
2875 if (!err)
2876 err = proc_put_char(&buffer, &left, '\n');
2877 }
2878
2879 if (!err) {
2880 if (write) {
2881 if (*ppos)
2882 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2883 else
2884 memcpy(bitmap, tmp_bitmap,
2885 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2886 }
2887 kfree(tmp_bitmap);
2888 *lenp -= left;
2889 *ppos += *lenp;
2890 return 0;
2891 } else {
2892 kfree(tmp_bitmap);
2893 return err;
2894 }
2895}
2896
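A minimal sketch of how a sysctl entry might be wired to proc_do_large_bitmap(), reconstructed from the docstring above; the bitmap, its size and the procname are hypothetical and not taken from this patch:

	/* Sketch only: hypothetical bitmap sysctl using proc_do_large_bitmap().
	 * table->data points at the bitmap, table->maxlen is its length in bits. */
	static DECLARE_BITMAP(example_bitmap, 65536);

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_bitmap",	/* hypothetical name */
			.data		= example_bitmap,
			.maxlen		= 65536,		/* bits, not bytes */
			.mode		= 0644,
			.proc_handler	= proc_do_large_bitmap,
		},
		{ }
	};

	/* Reading such a file prints ranges like "1,3-4,10"; writing "100-200\n"
	 * replaces the bitmap (or ORs into it on a continued write). */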
2609#else /* CONFIG_PROC_FS */ 2897#else /* CONFIG_PROC_FS */
2610 2898
2611int proc_dostring(struct ctl_table *table, int write, 2899int proc_dostring(struct ctl_table *table, int write,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,8 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
17#include <linux/slab.h>
16 18
17#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
18 20
@@ -223,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 225 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 227 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {} 228 {}
228}; 229};
229 230
@@ -1124,11 +1125,6 @@ out:
1124 return result; 1125 return result;
1125} 1126}
1126 1127
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{ 1130{
@@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1156 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out; 1153 goto out;
1158 1154
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1160 str += 2; 1157 str += 2;
1161 if (*str == '-') 1158 if (*str == '-')
1162 str++; 1159 str++;
@@ -1331,7 +1328,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1331 ssize_t result; 1328 ssize_t result;
1332 char *pathname; 1329 char *pathname;
1333 int flags; 1330 int flags;
1334 int acc_mode, fmode; 1331 int acc_mode;
1335 1332
1336 pathname = sysctl_getname(name, nlen, &table); 1333 pathname = sysctl_getname(name, nlen, &table);
1337 result = PTR_ERR(pathname); 1334 result = PTR_ERR(pathname);
@@ -1342,15 +1339,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1342 if (oldval && oldlen && newval && newlen) { 1339 if (oldval && oldlen && newval && newlen) {
1343 flags = O_RDWR; 1340 flags = O_RDWR;
1344 acc_mode = MAY_READ | MAY_WRITE; 1341 acc_mode = MAY_READ | MAY_WRITE;
1345 fmode = FMODE_READ | FMODE_WRITE;
1346 } else if (newval && newlen) { 1342 } else if (newval && newlen) {
1347 flags = O_WRONLY; 1343 flags = O_WRONLY;
1348 acc_mode = MAY_WRITE; 1344 acc_mode = MAY_WRITE;
1349 fmode = FMODE_WRITE;
1350 } else if (oldval && oldlen) { 1345 } else if (oldval && oldlen) {
1351 flags = O_RDONLY; 1346 flags = O_RDONLY;
1352 acc_mode = MAY_READ; 1347 acc_mode = MAY_READ;
1353 fmode = FMODE_READ;
1354 } else { 1348 } else {
1355 result = 0; 1349 result = 0;
1356 goto out_putname; 1350 goto out_putname;
@@ -1361,7 +1355,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1361 if (result) 1355 if (result)
1362 goto out_putname; 1356 goto out_putname;
1363 1357
1364 result = may_open(&nd.path, acc_mode, fmode); 1358 result = may_open(&nd.path, acc_mode, flags);
1365 if (result) 1359 if (result)
1366 goto out_putpath; 1360 goto out_putpath;
1367 1361
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d3caa7..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
@@ -46,15 +47,13 @@ static struct genl_family family = {
46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 .maxattr = TASKSTATS_CMD_ATTR_MAX,
47}; 48};
48 49
49static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
50__read_mostly = {
51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55 55
56static struct nla_policy 56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
57cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
58 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
59}; 58};
60 59
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..848b1c2ab09a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
@@ -133,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
133 */ 132 */
134static inline void warp_clock(void) 133static inline void warp_clock(void)
135{ 134{
136 write_seqlock_irq(&xtime_lock); 135 struct timespec adjust;
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 adjust = current_kernel_time();
139 update_xtime_cache(0); 138 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
140 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
141 clock_was_set();
142} 140}
143 141
144/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
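A sketch (not from this patch) of a clocksource that uses the per-device hooks walked above, with the new signatures that take the struct clocksource pointer; the device name and callback bodies are hypothetical:

	static cycle_t example_cs_read(struct clocksource *cs)
	{
		return 0;	/* read the hardware counter here */
	}

	static void example_cs_suspend(struct clocksource *cs)
	{
		/* quiesce the counter before the system sleeps */
	}

	static void example_cs_resume(struct clocksource *cs)
	{
		/* reprogram the counter after wakeup */
	}

	static struct clocksource example_cs = {
		.name		= "example",
		.rating		= 200,
		.read		= example_cs_read,
		.mask		= CLOCKSOURCE_MASK(32),
		.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
		.suspend	= example_cs_suspend,
		.resume		= example_cs_resume,
	};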
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
@@ -568,6 +592,10 @@ static inline void clocksource_select(void) { }
568 */ 592 */
569static int __init clocksource_done_booting(void) 593static int __init clocksource_done_booting(void)
570{ 594{
595 mutex_lock(&clocksource_mutex);
596 curr_clocksource = clocksource_default_clock();
597 mutex_unlock(&clocksource_mutex);
598
571 finished_booting = 1; 599 finished_booting = 1;
572 600
573 /* 601 /*
@@ -597,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
597 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
598} 626}
599 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_register_scale - Used to install new clocksources
643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
675
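The clocksource_register_hz()/clocksource_register_khz() helpers referred to in the comment live in the header and are not shown in this diff; from the docstring they reduce to picking the scale factor, roughly:

	static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
	{
		return __clocksource_register_scale(cs, 1, hz);
	}

	static inline int clocksource_register_khz(struct clocksource *cs, u32 khz)
	{
		return __clocksource_register_scale(cs, 1000, khz);
	}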
600/** 676/**
601 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
602 * @t: clocksource to be registered 678 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
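A stand-alone illustration (not kernel code) of how min_delta_ns grows under repeated programming failures: it jumps to 5000 ns, then grows by 50% per failure until clamped at MIN_DELTA_LIMIT, after which tick_increase_min_delta() returns -ETIME and the loop above gives up. HZ=250 (a 4 ms limit) is assumed here:

	#include <stdio.h>

	int main(void)
	{
		/* MIN_DELTA_LIMIT = NSEC_PER_SEC / HZ; assume HZ = 250 */
		unsigned long long min_delta_ns = 0, limit = 1000000000ULL / 250;
		int failures = 0;

		while (min_delta_ns < limit) {
			if (min_delta_ns < 5000)
				min_delta_ns = 5000;
			else
				min_delta_ns += min_delta_ns >> 1;
			if (min_delta_ns > limit)
				min_delta_ns = limit;
			printf("failure %2d: min_delta_ns = %llu\n",
			       ++failures, min_delta_ns);
		}
		return 0;
	}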
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..813993b5fb61 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,35 +150,65 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu(cpu) > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(cpu, ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
164} 182}
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(cpu, ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
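Sketch of a typical consumer of the two accessors above (for example a cpufreq governor computing CPU load over an interval); the function and its bookkeeping arguments are hypothetical:

	/* Returns busy time as a percentage of the elapsed wall time. */
	static unsigned int example_cpu_load(int cpu, u64 *prev_wall, u64 *prev_idle)
	{
		u64 wall, idle, wall_delta, idle_delta;

		idle = get_cpu_idle_time_us(cpu, &wall);
		if (idle == (u64)-1)
			return 0;	/* NOHZ disabled, sample some other way */

		wall_delta = wall - *prev_wall;
		idle_delta = idle - *prev_idle;
		*prev_wall = wall;
		*prev_idle = idle;

		if (!wall_delta || wall_delta < idle_delta)
			return 0;

		return div64_u64(100 * (wall_delta - idle_delta), wall_delta);
	}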
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -231,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
231 */ 284 */
232 ts->inidle = 1; 285 ts->inidle = 1;
233 286
234 now = tick_nohz_start_idle(ts); 287 now = tick_nohz_start_idle(cpu, ts);
235 288
236 /* 289 /*
237 * If this cpu is offline and it is the one which updates 290 * If this cpu is offline and it is the one which updates
@@ -272,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
272 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
273 326
274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
275 arch_needs_cpu(cpu)) { 328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
276 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
277 delta_jiffies = 1; 330 delta_jiffies = 1;
278 } else { 331 } else {
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -622,6 +611,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
622 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&xtime_lock, flags);
623 612
624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 613 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
614 clocksource_suspend();
625 615
626 return 0; 616 return 0;
627} 617}
@@ -787,7 +777,6 @@ void update_wall_time(void)
787{ 777{
788 struct clocksource *clock; 778 struct clocksource *clock;
789 cycle_t offset; 779 cycle_t offset;
790 u64 nsecs;
791 int shift = 0, maxshift; 780 int shift = 0, maxshift;
792 781
793 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -817,7 +806,8 @@ void update_wall_time(void)
817 shift = min(shift, maxshift); 806 shift = min(shift, maxshift);
818 while (offset >= timekeeper.cycle_interval) { 807 while (offset >= timekeeper.cycle_interval) {
819 offset = logarithmic_accumulation(offset, shift); 808 offset = logarithmic_accumulation(offset, shift);
820 shift--; 809 if(offset < timekeeper.cycle_interval<<shift)
810 shift--;
821 } 811 }
822 812
823 /* correct the clock when NTP error is too big */ 813 /* correct the clock when NTP error is too big */
@@ -845,7 +835,9 @@ void update_wall_time(void)
845 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
846 } 836 }
847 837
848 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
849 * add the remainder to the error difference. 841 * add the remainder to the error difference.
850 */ 842 */
851 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -853,8 +845,15 @@ void update_wall_time(void)
853 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
854 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
855 847
856 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
857 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
850 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
858 857
859 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -880,6 +879,7 @@ void getboottime(struct timespec *ts)
880 879
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 880 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 881}
882EXPORT_SYMBOL_GPL(getboottime);
883 883
884/** 884/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 885 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,16 +889,17 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 889{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 890 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 891}
892EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 893
893unsigned long get_seconds(void) 894unsigned long get_seconds(void)
894{ 895{
895 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
896} 897}
897EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
898 899
899struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
900{ 901{
901 return xtime_cache; 902 return xtime;
902} 903}
903 904
904struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -909,7 +910,7 @@ struct timespec current_kernel_time(void)
909 do { 910 do {
910 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
911 912
912 now = xtime_cache; 913 now = xtime;
913 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
914 915
915 return now; 916 return now;
@@ -924,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
924 do { 925 do {
925 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
926 927
927 now = xtime_cache; 928 now = xtime;
928 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
929 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
930 931
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
@@ -228,6 +229,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 229 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 230 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 231 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 233}
232 234
233static void timer_list_show_tickdevices(struct seq_file *m) 235static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 260 int cpu;
259 261
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 262 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 265
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..ee305c8d4e18 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -318,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
318} 319}
319EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
320 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of time, in jiffies, that a certain timer has
327 * in terms of slack. By setting this value, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
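A sketch of how a driver might use the new hook (all names below are hypothetical): a low-priority polling timer grants a full second of slack so its wakeups can be coalesced with others:

	static struct timer_list example_poll_timer;

	static void example_poll(unsigned long data)
	{
		/* ... periodic housekeeping ... */
		mod_timer(&example_poll_timer, jiffies + 10 * HZ);
	}

	static void example_start_polling(void)
	{
		setup_timer(&example_poll_timer, example_poll, 0);
		/* firing up to one second late is acceptable here */
		set_timer_slack(&example_poll_timer, HZ);
		mod_timer(&example_poll_timer, jiffies + 10 * HZ);
	}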
321 340
322static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
323 struct timer_list *timer) 342 struct timer_list *timer)
@@ -549,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
549{ 568{
550 timer->entry.next = NULL; 569 timer->entry.next = NULL;
551 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
552#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
553 timer->start_site = NULL; 573 timer->start_site = NULL;
554 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -714,6 +734,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
714} 734}
715EXPORT_SYMBOL(mod_timer_pending); 735EXPORT_SYMBOL(mod_timer_pending);
716 736
737/*
738 * Decide where to put the timer while taking the slack into account
739 *
740 * Algorithm:
741 * 1) calculate the maximum (absolute) time
742 * 2) calculate the highest bit where the expires and new max are different
743 * 3) use this bit to make a mask
744 * 4) use the bitmask to round down the maximum time, so that all last
745 * bits are zeros
746 */
747static inline
748unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749{
750 unsigned long expires_limit, mask;
751 int bit;
752
753 expires_limit = expires;
754
755 if (timer->slack >= 0) {
756 expires_limit = expires + timer->slack;
757 } else {
758 unsigned long now = jiffies;
759
760 /* No slack, if already expired else auto slack 0.4% */
761 if (time_after(expires, now))
762 expires_limit = expires + (expires - now)/256;
763 }
764 mask = expires ^ expires_limit;
765 if (mask == 0)
766 return expires;
767
768 bit = find_last_bit(&mask, BITS_PER_LONG);
769
770 mask = (1 << bit) - 1;
771
772 expires_limit = expires_limit & ~(mask);
773
774 return expires_limit;
775}
776
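Worked example (stand-alone, not kernel code) of the rounding above with the default slack of -1: a timer 10000 ticks out gets roughly 0.4% (39 ticks) of slack, and the bits below the highest bit where expires and the limit differ are cleared:

	#include <stdio.h>

	int main(void)
	{
		unsigned long now = 1000000, expires = 1010000;
		unsigned long limit = expires + (expires - now) / 256;	/* 1010039 */
		unsigned long mask = expires ^ limit;
		int bit = 0;

		while (mask >> (bit + 1))	/* index of the highest differing bit */
			bit++;
		mask = (1UL << bit) - 1;

		/* prints: expires=1010000 limit=1010039 rounded=1010016 */
		printf("expires=%lu limit=%lu rounded=%lu\n",
		       expires, limit, limit & ~mask);
		return 0;
	}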
717/** 777/**
718 * mod_timer - modify a timer's timeout 778 * mod_timer - modify a timer's timeout
719 * @timer: the timer to be modified 779 * @timer: the timer to be modified
@@ -744,6 +804,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
744 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
745 return 1; 805 return 1;
746 806
807 expires = apply_slack(timer, expires);
808
747 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 809 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
748} 810}
749EXPORT_SYMBOL(mod_timer); 811EXPORT_SYMBOL(mod_timer);
@@ -880,6 +942,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 942 if (base->running_timer == timer)
881 goto out; 943 goto out;
882 944
945 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 946 ret = 0;
884 if (timer_pending(timer)) { 947 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 948 detach_timer(timer, 1);
@@ -953,6 +1016,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
953 return index; 1016 return index;
954} 1017}
955 1018
1019static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1020 unsigned long data)
1021{
1022 int preempt_count = preempt_count();
1023
1024#ifdef CONFIG_LOCKDEP
1025 /*
1026 * It is permissible to free the timer from inside the
1027 * function that is called from it, this we need to take into
1028 * account for lockdep too. To avoid bogus "held lock freed"
1029 * warnings as well as problems when looking into
1030 * timer->lockdep_map, make a copy and use that here.
1031 */
1032 struct lockdep_map lockdep_map = timer->lockdep_map;
1033#endif
1034 /*
1035 * Couple the lock chain with the lock chain at
1036 * del_timer_sync() by acquiring the lock_map around the fn()
1037 * call here and in del_timer_sync().
1038 */
1039 lock_map_acquire(&lockdep_map);
1040
1041 trace_timer_expire_entry(timer);
1042 fn(data);
1043 trace_timer_expire_exit(timer);
1044
1045 lock_map_release(&lockdep_map);
1046
1047 if (preempt_count != preempt_count()) {
1048 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1049 fn, preempt_count, preempt_count());
1050 /*
1051 * Restore the preempt count. That gives us a decent
1052 * chance to survive and extract information. If the
1053 * callback kept a lock held, bad luck, but not worse
1054 * than the BUG() we had.
1055 */
1056 preempt_count() = preempt_count;
1057 }
1058}
1059
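The preempt_count check above catches callbacks that return with a different preemption depth than they entered with; a hypothetical offender would look like this:

	/* Sketch of the bug class call_timer_fn() now survives and reports:
	 * the callback disables preemption and forgets to re-enable it,
	 * so preempt_count() differs across fn(data) and WARN_ONCE fires. */
	static void buggy_timer_fn(unsigned long data)
	{
		preempt_disable();
		/* ... work ... */
		/* missing preempt_enable() */
	}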
956#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1060#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
957 1061
958/** 1062/**
@@ -996,45 +1100,7 @@ static inline void __run_timers(struct tvec_base *base)
996 detach_timer(timer, 1); 1100 detach_timer(timer, 1);
997 1101
998 spin_unlock_irq(&base->lock); 1102 spin_unlock_irq(&base->lock);
999 { 1103 call_timer_fn(timer, fn, data);
1000 int preempt_count = preempt_count();
1001
1002#ifdef CONFIG_LOCKDEP
1003 /*
1004 * It is permissible to free the timer from
1005 * inside the function that is called from
1006 * it, this we need to take into account for
1007 * lockdep too. To avoid bogus "held lock
1008 * freed" warnings as well as problems when
1009 * looking into timer->lockdep_map, make a
1010 * copy and use that here.
1011 */
1012 struct lockdep_map lockdep_map =
1013 timer->lockdep_map;
1014#endif
1015 /*
1016 * Couple the lock chain with the lock chain at
1017 * del_timer_sync() by acquiring the lock_map
1018 * around the fn() call here and in
1019 * del_timer_sync().
1020 */
1021 lock_map_acquire(&lockdep_map);
1022
1023 trace_timer_expire_entry(timer);
1024 fn(data);
1025 trace_timer_expire_exit(timer);
1026
1027 lock_map_release(&lockdep_map);
1028
1029 if (preempt_count != preempt_count()) {
1030 printk(KERN_ERR "huh, entered %p "
1031 "with preempt_count %08x, exited"
1032 " with %08x?\n",
1033 fn, preempt_count,
1034 preempt_count());
1035 BUG();
1036 }
1037 }
1038 spin_lock_irq(&base->lock); 1104 spin_lock_irq(&base->lock);
1039 } 1105 }
1040 } 1106 }
@@ -1618,11 +1684,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1618 unsigned long action, void *hcpu) 1684 unsigned long action, void *hcpu)
1619{ 1685{
1620 long cpu = (long)hcpu; 1686 long cpu = (long)hcpu;
1687 int err;
1688
1621 switch(action) { 1689 switch(action) {
1622 case CPU_UP_PREPARE: 1690 case CPU_UP_PREPARE:
1623 case CPU_UP_PREPARE_FROZEN: 1691 case CPU_UP_PREPARE_FROZEN:
1624 if (init_timers_cpu(cpu) < 0) 1692 err = init_timers_cpu(cpu);
1625 return NOTIFY_BAD; 1693 if (err < 0)
1694 return notifier_from_errno(err);
1626 break; 1695 break;
1627#ifdef CONFIG_HOTPLUG_CPU 1696#ifdef CONFIG_HOTPLUG_CPU
1628 case CPU_DEAD: 1697 case CPU_DEAD:
@@ -1648,7 +1717,7 @@ void __init init_timers(void)
1648 1717
1649 init_timer_stats(); 1718 init_timer_stats();
1650 1719
1651 BUG_ON(err == NOTIFY_BAD); 1720 BUG_ON(err != NOTIFY_OK);
1652 register_cpu_notifier(&timers_nb); 1721 register_cpu_notifier(&timers_nb);
1653 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1722 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1654} 1723}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f289..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
@@ -46,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
46 help 44 help
47 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER
50 bool
51
52config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
53 bool 48 bool
54 help 49 help
@@ -330,15 +325,6 @@ config BRANCH_TRACER
330 325
331 Say N if unsure. 326 Say N if unsure.
332 327
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 328config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 329 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 330 depends on HAVE_HW_BREAKPOINT
@@ -385,14 +371,6 @@ config STACK_TRACER
385 371
386 Say N if unsure. 372 Say N if unsure.
387 373
388config HW_BRANCH_TRACER
389 depends on HAVE_HW_BRANCH_TRACER
390 bool "Trace hw branches"
391 select GENERIC_TRACER
392 help
393 This tracer records all branches on the system in a circular
394 buffer, giving access to the last N branches for each cpu.
395
396config KMEMTRACE 374config KMEMTRACE
397 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 376 select GENERIC_TRACER
@@ -451,7 +429,7 @@ config BLK_DEV_IO_TRACE
451 429
452config KPROBE_EVENT 430config KPROBE_EVENT
453 depends on KPROBES 431 depends on KPROBES
454 depends on X86 432 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 433 bool "Enable kprobes-based dynamic events"
456 select TRACING 434 select TRACING
457 default y 435 default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -51,7 +50,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 50obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 51obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 52obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 53ifeq ($(CONFIG_PERF_EVENTS),y)
54obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
55endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 57obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 58obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d9d6206e0b14..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
@@ -540,9 +541,10 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
540 if (ret) 541 if (ret)
541 return ret; 542 return ret;
542 543
543 if (copy_to_user(arg, &buts, sizeof(buts))) 544 if (copy_to_user(arg, &buts, sizeof(buts))) {
545 blk_trace_remove(q);
544 return -EFAULT; 546 return -EFAULT;
545 547 }
546 return 0; 548 return 0;
547} 549}
548EXPORT_SYMBOL_GPL(blk_trace_setup); 550EXPORT_SYMBOL_GPL(blk_trace_setup);
@@ -673,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
673 } 675 }
674} 676}
675 677
676static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
677{ 680{
678 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
679} 682}
680 683
681static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
682{ 686{
683 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
684} 688}
685 689
686static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
687{ 692{
688 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
689} 694}
690 695
691static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
692 struct request *rq) 698 struct request *rq)
693{ 699{
694 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
695} 701}
696 702
697static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
698 struct request *rq) 705 struct request *rq)
699{ 706{
700 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -722,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
722 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
723} 730}
724 731
725static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
726{ 734{
727 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
728} 736}
729 737
730static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
731{ 740{
732 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
733} 742}
734 743
735static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
736 struct bio *bio) 746 struct bio *bio)
737{ 747{
738 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
739} 749}
740 750
741static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
742 struct bio *bio) 753 struct bio *bio)
743{ 754{
744 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
745} 756}
746 757
747static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
748{ 760{
749 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
750} 762}
751 763
752static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
753 struct bio *bio, int rw) 766 struct bio *bio, int rw)
754{ 767{
755 if (bio) 768 if (bio)
@@ -763,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
763} 776}
764 777
765 778
766static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
767 struct bio *bio, int rw) 781 struct bio *bio, int rw)
768{ 782{
769 if (bio) 783 if (bio)
@@ -777,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
777 } 791 }
778} 792}
779 793
780static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
781{ 795{
782 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
783 797
@@ -785,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
785 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
786} 800}
787 801
788static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
789{ 803{
790 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
791 805
@@ -798,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
798 } 812 }
799} 813}
800 814
801static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
802{ 816{
803 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
804 818
@@ -811,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
811 } 825 }
812} 826}
813 827
814static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
815 unsigned int pdu) 830 unsigned int pdu)
816{ 831{
817 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -827,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
827 842
828/** 843/**
829 * blk_add_trace_remap - Add a trace for a remap operation 844 * blk_add_trace_remap - Add a trace for a remap operation
845 * @ignore: trace callback data parameter (not used)
830 * @q: queue the io is for 846 * @q: queue the io is for
831 * @bio: the source bio 847 * @bio: the source bio
832 * @dev: target device 848 * @dev: target device
@@ -837,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
837 * it spans a stripe (or similar). Add a trace for that action. 853 * it spans a stripe (or similar). Add a trace for that action.
838 * 854 *
839 **/ 855 **/
840static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 856static void blk_add_trace_remap(void *ignore,
841 dev_t dev, sector_t from) 857 struct request_queue *q, struct bio *bio,
858 dev_t dev, sector_t from)
842{ 859{
843 struct blk_trace *bt = q->blk_trace; 860 struct blk_trace *bt = q->blk_trace;
844 struct blk_io_trace_remap r; 861 struct blk_io_trace_remap r;
@@ -857,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
857 874
858/** 875/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 876 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
877 * @ignore: trace callback data parameter (not used)
860 * @q: queue the io is for 878 * @q: queue the io is for
861 * @rq: the source request 879 * @rq: the source request
862 * @dev: target device 880 * @dev: target device
@@ -867,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
867 * Add a trace for that action. 885 * Add a trace for that action.
868 * 886 *
869 **/ 887 **/
870static void blk_add_trace_rq_remap(struct request_queue *q, 888static void blk_add_trace_rq_remap(void *ignore,
889 struct request_queue *q,
871 struct request *rq, dev_t dev, 890 struct request *rq, dev_t dev,
872 sector_t from) 891 sector_t from)
873{ 892{
@@ -919,64 +938,64 @@ static void blk_register_tracepoints(void)
919{ 938{
920 int ret; 939 int ret;
921 940
922 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 941 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
923 WARN_ON(ret); 942 WARN_ON(ret);
924 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 943 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
925 WARN_ON(ret); 944 WARN_ON(ret);
926 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 945 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
927 WARN_ON(ret); 946 WARN_ON(ret);
928 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 947 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
929 WARN_ON(ret); 948 WARN_ON(ret);
930 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 949 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
931 WARN_ON(ret); 950 WARN_ON(ret);
932 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 951 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
933 WARN_ON(ret); 952 WARN_ON(ret);
934 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 953 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
935 WARN_ON(ret); 954 WARN_ON(ret);
936 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 955 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
937 WARN_ON(ret); 956 WARN_ON(ret);
938 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 957 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
939 WARN_ON(ret); 958 WARN_ON(ret);
940 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 959 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
941 WARN_ON(ret); 960 WARN_ON(ret);
942 ret = register_trace_block_getrq(blk_add_trace_getrq); 961 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
943 WARN_ON(ret); 962 WARN_ON(ret);
944 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 963 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
945 WARN_ON(ret); 964 WARN_ON(ret);
946 ret = register_trace_block_plug(blk_add_trace_plug); 965 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
947 WARN_ON(ret); 966 WARN_ON(ret);
948 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 967 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
949 WARN_ON(ret); 968 WARN_ON(ret);
950 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 969 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
951 WARN_ON(ret); 970 WARN_ON(ret);
952 ret = register_trace_block_split(blk_add_trace_split); 971 ret = register_trace_block_split(blk_add_trace_split, NULL);
953 WARN_ON(ret); 972 WARN_ON(ret);
954 ret = register_trace_block_remap(blk_add_trace_remap); 973 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
955 WARN_ON(ret); 974 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 975 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
957 WARN_ON(ret); 976 WARN_ON(ret);
958} 977}
959 978
960static void blk_unregister_tracepoints(void) 979static void blk_unregister_tracepoints(void)
961{ 980{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 981 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
963 unregister_trace_block_remap(blk_add_trace_remap); 982 unregister_trace_block_remap(blk_add_trace_remap, NULL);
964 unregister_trace_block_split(blk_add_trace_split); 983 unregister_trace_block_split(blk_add_trace_split, NULL);
965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 984 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
966 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 985 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
967 unregister_trace_block_plug(blk_add_trace_plug); 986 unregister_trace_block_plug(blk_add_trace_plug, NULL);
968 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 987 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
969 unregister_trace_block_getrq(blk_add_trace_getrq); 988 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
970 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 989 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
971 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 990 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
972 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 991 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
973 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 992 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
974 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 993 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
975 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 994 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
976 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 995 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
977 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 996 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
978 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 997 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
979 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 998 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
980 999
981 tracepoint_synchronize_unregister(); 1000 tracepoint_synchronize_unregister();
982} 1001}
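
The mechanical change running through this hunk and the rest of the file is the 2.6.35 tracepoint update: every probe gains a private data pointer as its first argument, and the register/unregister helpers take that pointer as a second parameter (NULL here, since blktrace keeps no per-registration state). A minimal sketch of the resulting pattern, with made-up names (my_probe, my_cookie) that are not part of the patch:

#include <trace/events/block.h>		/* declares register_trace_block_bio_queue() */

struct my_cookie { atomic_t hits; };	/* illustrative private state */
static struct my_cookie cookie;

/* the first argument is whatever pointer was passed at register time */
static void my_probe(void *data, struct request_queue *q, struct bio *bio)
{
	struct my_cookie *c = data;

	atomic_inc(&c->hits);
}

static int my_attach(void)
{
	/* the second argument comes back to the probe as "data" */
	return register_trace_block_bio_queue(my_probe, &cookie);
}

static void my_detach(void)
{
	unregister_trace_block_bio_queue(my_probe, &cookie);
	tracepoint_synchronize_unregister();	/* wait for in-flight probes */
}
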
@@ -1319,7 +1338,7 @@ out:
1319} 1338}
1320 1339
1321static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1340static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1322 int flags) 1341 int flags, struct trace_event *event)
1323{ 1342{
1324 return print_one_line(iter, false); 1343 return print_one_line(iter, false);
1325} 1344}
@@ -1341,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1341} 1360}
1342 1361
1343static enum print_line_t 1362static enum print_line_t
1344blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1363blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1364 struct trace_event *event)
1345{ 1365{
1346 return blk_trace_synthesize_old_trace(iter) ? 1366 return blk_trace_synthesize_old_trace(iter) ?
1347 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1367 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1379,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = {
1379 .set_flag = blk_tracer_set_flag, 1399 .set_flag = blk_tracer_set_flag,
1380}; 1400};
1381 1401
1382static struct trace_event trace_blk_event = { 1402static struct trace_event_functions trace_blk_event_funcs = {
1383 .type = TRACE_BLK,
1384 .trace = blk_trace_event_print, 1403 .trace = blk_trace_event_print,
1385 .binary = blk_trace_event_print_binary, 1404 .binary = blk_trace_event_print_binary,
1386}; 1405};
1387 1406
1407static struct trace_event trace_blk_event = {
1408 .type = TRACE_BLK,
1409 .funcs = &trace_blk_event_funcs,
1410};
1411
1388static int __init init_blk_tracer(void) 1412static int __init init_blk_tracer(void)
1389{ 1413{
1390 if (!register_ftrace_event(&trace_blk_event)) { 1414 if (!register_ftrace_event(&trace_blk_event)) {
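
The reshuffle above is the trace_event_functions split that lands in 2.6.35: the output callbacks move into a shared struct trace_event_functions that struct trace_event only points at through .funcs, and each callback now also receives the struct trace_event itself. A hedged sketch of how a print path is then expected to dispatch; the wrapper below is illustrative, not a quote of the tracer core:

static enum print_line_t print_blk_event(struct trace_iterator *iter,
					 struct trace_event *event, int flags)
{
	if (event->funcs->trace)
		return event->funcs->trace(iter, flags, event);

	return TRACE_TYPE_UNHANDLED;
}
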
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,12 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31#include <linux/rcupdate.h>
31 32
32#include <trace/events/sched.h> 33#include <trace/events/sched.h>
33 34
@@ -85,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 86ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 87ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
87 88
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER 89/*
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); 90 * Traverse the ftrace_list, invoking all entries. The reason that we
90#endif 91 * can use rcu_dereference_raw() is that elements removed from this list
91 92 * are simply leaked, so there is no need to interact with a grace-period
93 * mechanism. The rcu_dereference_raw() calls are needed to handle
94 * concurrent insertions into the ftrace_list.
95 *
96 * Silly Alpha and silly pointer-speculation compiler optimizations!
97 */
92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 98static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
93{ 99{
94 struct ftrace_ops *op = ftrace_list; 100 struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
95
96 /* in case someone actually ports this to alpha! */
97 read_barrier_depends();
98 101
99 while (op != &ftrace_list_end) { 102 while (op != &ftrace_list_end) {
100 /* silly alpha */
101 read_barrier_depends();
102 op->func(ip, parent_ip); 103 op->func(ip, parent_ip);
103 op = op->next; 104 op = rcu_dereference_raw(op->next); /*see above*/
104 }; 105 };
105} 106}
106 107
@@ -155,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 * the ops->next pointer is valid before another CPU sees 156 * the ops->next pointer is valid before another CPU sees
156 * the ops pointer included into the ftrace_list. 157 * the ops pointer included into the ftrace_list.
157 */ 158 */
158 smp_wmb(); 159 rcu_assign_pointer(ftrace_list, ops);
159 ftrace_list = ops;
160 160
161 if (ftrace_enabled) { 161 if (ftrace_enabled) {
162 ftrace_func_t func; 162 ftrace_func_t func;
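
Both hunks above trade open-coded barriers for the RCU list-publication idiom: the writer publishes a new ftrace_ops with rcu_assign_pointer(), and the lockless caller path walks the list through rcu_dereference_raw(). The raw variant is sufficient only because removed entries are deliberately leaked rather than freed, so the reader has no grace period to respect. A compressed sketch of the pairing, assuming the file-local ftrace_list and ftrace_list_end symbols:

static void demo_publish(struct ftrace_ops *ops)
{
	ops->next = ftrace_list;		/* link the new entry in first ... */
	rcu_assign_pointer(ftrace_list, ops);	/* ... then publish it */
}

static void demo_traverse(unsigned long ip, unsigned long parent_ip)
{
	struct ftrace_ops *op;

	for (op = rcu_dereference_raw(ftrace_list);
	     op != &ftrace_list_end;
	     op = rcu_dereference_raw(op->next))
		op->func(ip, parent_ip);	/* call every registered tracer */
}
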
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
405 * Divide only 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
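
For the record, the arithmetic behind the new column: with n = rec->counter samples t_1..t_n and avg = (t_1 + ... + t_n)/n, the sample variance is sum((t_i - avg)^2)/(n - 1), which expands to (sum(t_i^2) - n*avg^2)/(n - 1); that is exactly rec->time_squared minus counter*avg*avg, divided by counter - 1. The do_div() by 1000 here, combined with the second divide-by-1000 inside trace_print_graph_duration(), supplies the factor of 10^6 needed for the ns^2 to us^2 conversion. Note that the value printed under the s^2 header is the variance (the squared deviation), not its square root, despite the stddev variable name.
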
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -898,36 +920,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 920 } \
899 } 921 }
900 922
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 923static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 924{
933 rec->freelist = ftrace_free_records; 925 rec->freelist = ftrace_free_records;
@@ -1025,6 +1017,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 1017}
1026 1018
1027 1019
1020/* Return 1 if the address range is reserved for ftrace */
1021int ftrace_text_reserved(void *start, void *end)
1022{
1023 struct dyn_ftrace *rec;
1024 struct ftrace_page *pg;
1025
1026 do_for_each_ftrace_rec(pg, rec) {
1027 if (rec->ip <= (unsigned long)end &&
1028 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1029 return 1;
1030 } while_for_each_ftrace_rec();
1031 return 0;
1032}
1033
1034
1028static int 1035static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1036__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1037{
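
The new ftrace_text_reserved() helper gives other text-patching code a way to check whether a candidate range overlaps an mcount call site before touching it, which is what lets the kprobes-specific freeze/unfreeze machinery be deleted further down. A hedged sketch of a caller; the wrapper and error code are illustrative rather than copied from kprobes:

static int can_patch_text(void *addr, size_t len)
{
	/* refuse to patch instructions that belong to an ftrace mcount site */
	if (ftrace_text_reserved(addr, addr + len - 1))
		return -EBUSY;

	return 0;
}
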
@@ -1076,14 +1083,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1083 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1084 continue;
1078 1085
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1086 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1087 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1088 rec->flags |= FTRACE_FL_FAILED;
@@ -2300,6 +2299,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
2300 2299
2301#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2300#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2302static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 2301static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2302static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
2303
2303static int __init set_graph_function(char *str) 2304static int __init set_graph_function(char *str)
2304{ 2305{
2305 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); 2306 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -2426,6 +2427,7 @@ static const struct file_operations ftrace_notrace_fops = {
2426static DEFINE_MUTEX(graph_lock); 2427static DEFINE_MUTEX(graph_lock);
2427 2428
2428int ftrace_graph_count; 2429int ftrace_graph_count;
2430int ftrace_graph_filter_enabled;
2429unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2431unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2430 2432
2431static void * 2433static void *
@@ -2448,7 +2450,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2448 mutex_lock(&graph_lock); 2450 mutex_lock(&graph_lock);
2449 2451
2450 /* Nothing, tell g_show to print all functions are enabled */ 2452 /* Nothing, tell g_show to print all functions are enabled */
2451 if (!ftrace_graph_count && !*pos) 2453 if (!ftrace_graph_filter_enabled && !*pos)
2452 return (void *)1; 2454 return (void *)1;
2453 2455
2454 return __g_next(m, pos); 2456 return __g_next(m, pos);
@@ -2494,6 +2496,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2494 mutex_lock(&graph_lock); 2496 mutex_lock(&graph_lock);
2495 if ((file->f_mode & FMODE_WRITE) && 2497 if ((file->f_mode & FMODE_WRITE) &&
2496 (file->f_flags & O_TRUNC)) { 2498 (file->f_flags & O_TRUNC)) {
2499 ftrace_graph_filter_enabled = 0;
2497 ftrace_graph_count = 0; 2500 ftrace_graph_count = 0;
2498 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2501 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2499 } 2502 }
@@ -2519,7 +2522,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2519 struct dyn_ftrace *rec; 2522 struct dyn_ftrace *rec;
2520 struct ftrace_page *pg; 2523 struct ftrace_page *pg;
2521 int search_len; 2524 int search_len;
2522 int found = 0; 2525 int fail = 1;
2523 int type, not; 2526 int type, not;
2524 char *search; 2527 char *search;
2525 bool exists; 2528 bool exists;
@@ -2530,37 +2533,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2530 2533
2531 /* decode regex */ 2534 /* decode regex */
2532 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2535 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2533 if (not) 2536 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2534 return -EINVAL; 2537 return -EBUSY;
2535 2538
2536 search_len = strlen(search); 2539 search_len = strlen(search);
2537 2540
2538 mutex_lock(&ftrace_lock); 2541 mutex_lock(&ftrace_lock);
2539 do_for_each_ftrace_rec(pg, rec) { 2542 do_for_each_ftrace_rec(pg, rec) {
2540 2543
2541 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2542 break;
2543
2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2544 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2545 continue; 2545 continue;
2546 2546
2547 if (ftrace_match_record(rec, search, search_len, type)) { 2547 if (ftrace_match_record(rec, search, search_len, type)) {
2548 /* ensure it is not already in the array */ 2548 /* if it is in the array */
2549 exists = false; 2549 exists = false;
2550 for (i = 0; i < *idx; i++) 2550 for (i = 0; i < *idx; i++) {
2551 if (array[i] == rec->ip) { 2551 if (array[i] == rec->ip) {
2552 exists = true; 2552 exists = true;
2553 break; 2553 break;
2554 } 2554 }
2555 if (!exists) 2555 }
2556 array[(*idx)++] = rec->ip; 2556
2557 found = 1; 2557 if (!not) {
2558 fail = 0;
2559 if (!exists) {
2560 array[(*idx)++] = rec->ip;
2561 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2562 goto out;
2563 }
2564 } else {
2565 if (exists) {
2566 array[i] = array[--(*idx)];
2567 array[*idx] = 0;
2568 fail = 0;
2569 }
2570 }
2558 } 2571 }
2559 } while_for_each_ftrace_rec(); 2572 } while_for_each_ftrace_rec();
2560 2573out:
2561 mutex_unlock(&ftrace_lock); 2574 mutex_unlock(&ftrace_lock);
2562 2575
2563 return found ? 0 : -EINVAL; 2576 if (fail)
2577 return -EINVAL;
2578
2579 ftrace_graph_filter_enabled = 1;
2580 return 0;
2564} 2581}
2565 2582
2566static ssize_t 2583static ssize_t
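
The rewrite above turns set_graph_function from an append-only list into one that also honours negation: a plain pattern adds matching functions, while a pattern written with a leading '!' removes them again, using the swap-with-last-slot trick to keep the array dense. A small sketch of the parsing step it leans on; the buffer contents are illustrative:

static int demo_parse(void)
{
	char buf[] = "!do_IRQ";	/* illustrative pattern */
	char *search;
	int type, not;

	/* a leading '!' is stripped and reported through "not" */
	type = filter_parse_regex(buf, strlen(buf), &search, &not);

	/* here: not == 1, search points at "do_IRQ", type is MATCH_FULL */
	return type;
}
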
@@ -2570,16 +2587,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2570 struct trace_parser parser; 2587 struct trace_parser parser;
2571 ssize_t read, ret; 2588 ssize_t read, ret;
2572 2589
2573 if (!cnt || cnt < 0) 2590 if (!cnt)
2574 return 0; 2591 return 0;
2575 2592
2576 mutex_lock(&graph_lock); 2593 mutex_lock(&graph_lock);
2577 2594
2578 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2579 ret = -EBUSY;
2580 goto out_unlock;
2581 }
2582
2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2595 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2584 ret = -ENOMEM; 2596 ret = -ENOMEM;
2585 goto out_unlock; 2597 goto out_unlock;
@@ -3222,8 +3234,8 @@ free:
3222} 3234}
3223 3235
3224static void 3236static void
3225ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(void *ignore,
3226 struct task_struct *next) 3238 struct task_struct *prev, struct task_struct *next)
3227{ 3239{
3228 unsigned long long timestamp; 3240 unsigned long long timestamp;
3229 int index; 3241 int index;
@@ -3277,7 +3289,7 @@ static int start_graph_tracing(void)
3277 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3278 3290
3279 if (!ret) { 3291 if (!ret) {
3280 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3281 if (ret) 3293 if (ret)
3282 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3283 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3349,11 +3361,11 @@ void unregister_ftrace_graph(void)
3349 goto out; 3361 goto out;
3350 3362
3351 ftrace_graph_active--; 3363 ftrace_graph_active--;
3352 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3353 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3364 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3354 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3355 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3356 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3357 3369
3358 out: 3370 out:
3359 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
@@ -3364,6 +3376,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3364{ 3376{
3365 /* Make sure we do not use the parent ret_stack */ 3377 /* Make sure we do not use the parent ret_stack */
3366 t->ret_stack = NULL; 3378 t->ret_stack = NULL;
3379 t->curr_ret_stack = -1;
3367 3380
3368 if (ftrace_graph_active) { 3381 if (ftrace_graph_active) {
3369 struct ftrace_ret_stack *ret_stack; 3382 struct ftrace_ret_stack *ret_stack;
@@ -3373,7 +3386,6 @@ void ftrace_graph_init_task(struct task_struct *t)
3373 GFP_KERNEL); 3386 GFP_KERNEL);
3374 if (!ret_stack) 3387 if (!ret_stack)
3375 return; 3388 return;
3376 t->curr_ret_stack = -1;
3377 atomic_set(&t->tracing_graph_pause, 0); 3389 atomic_set(&t->tracing_graph_pause, 0);
3378 atomic_set(&t->trace_overrun, 0); 3390 atomic_set(&t->trace_overrun, 0);
3379 t->ftrace_timestamp = 0; 3391 t->ftrace_timestamp = 0;
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..1da7b6ea8b85 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -309,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
309#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
310#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
311 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
312struct buffer_data_page { 327struct buffer_data_page {
313 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
314 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
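
Because these two flags are carved out of the same commit word that records how much data the page holds, anything treating that word as a length has to mask them off first (the benchmark change near the end of this patch does the equivalent with a 0xfffff mask). A one-line sketch, assuming the buffer_data_page layout shown above:

	unsigned long size = local_read(&bpage->commit) &
			     ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
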
@@ -328,6 +343,7 @@ struct buffer_page {
328 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
329 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
330 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
331 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
332}; 348};
333 349
@@ -407,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
407 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
409 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
410 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
411 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
412 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -430,6 +452,8 @@ struct ring_buffer_per_cpu {
430 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
431 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
432 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
433 local_t commit_overrun; 457 local_t commit_overrun;
434 local_t overrun; 458 local_t overrun;
435 local_t entries; 459 local_t entries;
@@ -464,6 +488,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 488 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 489 unsigned long head;
466 struct buffer_page *head_page; 490 struct buffer_page *head_page;
491 struct buffer_page *cache_reader_page;
492 unsigned long cache_read;
467 u64 read_stamp; 493 u64 read_stamp;
468}; 494};
469 495
@@ -1198,18 +1224,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1198 1224
1199 for (i = 0; i < nr_pages; i++) { 1225 for (i = 0; i < nr_pages; i++) {
1200 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1226 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201 return; 1227 goto out;
1202 p = cpu_buffer->pages->next; 1228 p = cpu_buffer->pages->next;
1203 bpage = list_entry(p, struct buffer_page, list); 1229 bpage = list_entry(p, struct buffer_page, list);
1204 list_del_init(&bpage->list); 1230 list_del_init(&bpage->list);
1205 free_buffer_page(bpage); 1231 free_buffer_page(bpage);
1206 } 1232 }
1207 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1233 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208 return; 1234 goto out;
1209 1235
1210 rb_reset_cpu(cpu_buffer); 1236 rb_reset_cpu(cpu_buffer);
1211 rb_check_pages(cpu_buffer); 1237 rb_check_pages(cpu_buffer);
1212 1238
1239out:
1213 spin_unlock_irq(&cpu_buffer->reader_lock); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1214} 1241}
1215 1242
@@ -1226,7 +1253,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1226 1253
1227 for (i = 0; i < nr_pages; i++) { 1254 for (i = 0; i < nr_pages; i++) {
1228 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1255 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1229 return; 1256 goto out;
1230 p = pages->next; 1257 p = pages->next;
1231 bpage = list_entry(p, struct buffer_page, list); 1258 bpage = list_entry(p, struct buffer_page, list);
1232 list_del_init(&bpage->list); 1259 list_del_init(&bpage->list);
@@ -1235,6 +1262,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1235 rb_reset_cpu(cpu_buffer); 1262 rb_reset_cpu(cpu_buffer);
1236 rb_check_pages(cpu_buffer); 1263 rb_check_pages(cpu_buffer);
1237 1264
1265out:
1238 spin_unlock_irq(&cpu_buffer->reader_lock); 1266 spin_unlock_irq(&cpu_buffer->reader_lock);
1239} 1267}
1240 1268
@@ -1544,7 +1572,7 @@ rb_update_event(struct ring_buffer_event *event,
1544 1572
1545 case 0: 1573 case 0:
1546 length -= RB_EVNT_HDR_SIZE; 1574 length -= RB_EVNT_HDR_SIZE;
1547 if (length > RB_MAX_SMALL_DATA) 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1548 event->array[0] = length; 1576 event->array[0] = length;
1549 else 1577 else
1550 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1719,11 +1747,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1719 if (!length) 1747 if (!length)
1720 length = 1; 1748 length = 1;
1721 1749
1722 if (length > RB_MAX_SMALL_DATA) 1750 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1723 length += sizeof(event.array[0]); 1751 length += sizeof(event.array[0]);
1724 1752
1725 length += RB_EVNT_HDR_SIZE; 1753 length += RB_EVNT_HDR_SIZE;
1726 length = ALIGN(length, RB_ALIGNMENT); 1754 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1727 1755
1728 return length; 1756 return length;
1729} 1757}
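
To put numbers on the new alignment rule: assuming the usual 4-byte event header, a 12-byte payload on an architecture with efficient unaligned access (RB_FORCE_8BYTE_ALIGNMENT == 0) still uses the compact encoding, so the event costs ALIGN(12 + 4, 4) = 16 bytes with the size packed into type_len. On a 64-bit architecture without that capability the same payload is pushed through the explicit length word in array[0], costing 12 + 4 + 4 = 20 bytes, rounded up to ALIGN(20, 8) = 24. The payload size is only an illustrative example; the patch fixes the rule, not the numbers.
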
@@ -1740,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1740 * must fill the old tail_page with padding. 1768 * must fill the old tail_page with padding.
1741 */ 1769 */
1742 if (tail >= BUF_PAGE_SIZE) { 1770 if (tail >= BUF_PAGE_SIZE) {
1771 /*
1772 * If the page was filled, then we still need
1773 * to update the real_end. Reset it to zero
1774 * and the reader will ignore it.
1775 */
1776 if (tail == BUF_PAGE_SIZE)
1777 tail_page->real_end = 0;
1778
1743 local_sub(length, &tail_page->write); 1779 local_sub(length, &tail_page->write);
1744 return; 1780 return;
1745 } 1781 }
@@ -1748,6 +1784,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1748 kmemcheck_annotate_bitfield(event, bitfield); 1784 kmemcheck_annotate_bitfield(event, bitfield);
1749 1785
1750 /* 1786 /*
1787 * Save the original length to the meta data.
1788 * This will be used by the reader to add lost event
1789 * counter.
1790 */
1791 tail_page->real_end = tail;
1792
1793 /*
1751 * If this event is bigger than the minimum size, then 1794 * If this event is bigger than the minimum size, then
1752 * we need to be careful that we don't subtract the 1795 * we need to be careful that we don't subtract the
1753 * write counter enough to allow another writer to slip 1796 * write counter enough to allow another writer to slip
@@ -1965,17 +2008,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1965 u64 *ts, u64 *delta) 2008 u64 *ts, u64 *delta)
1966{ 2009{
1967 struct ring_buffer_event *event; 2010 struct ring_buffer_event *event;
1968 static int once;
1969 int ret; 2011 int ret;
1970 2012
1971 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2013 WARN_ONCE(*delta > (1ULL << 59),
1972 printk(KERN_WARNING "Delta way too big! %llu" 2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1973 " ts=%llu write stamp = %llu\n", 2015 (unsigned long long)*delta,
1974 (unsigned long long)*delta, 2016 (unsigned long long)*ts,
1975 (unsigned long long)*ts, 2017 (unsigned long long)cpu_buffer->write_stamp);
1976 (unsigned long long)cpu_buffer->write_stamp);
1977 WARN_ON(1);
1978 }
1979 2018
1980 /* 2019 /*
1981 * The delta is too big, we to add a 2020 * The delta is too big, we to add a
@@ -2230,12 +2269,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2230 if (ring_buffer_flags != RB_BUFFERS_ON) 2269 if (ring_buffer_flags != RB_BUFFERS_ON)
2231 return NULL; 2270 return NULL;
2232 2271
2233 if (atomic_read(&buffer->record_disabled))
2234 return NULL;
2235
2236 /* If we are tracing schedule, we don't want to recurse */ 2272 /* If we are tracing schedule, we don't want to recurse */
2237 resched = ftrace_preempt_disable(); 2273 resched = ftrace_preempt_disable();
2238 2274
2275 if (atomic_read(&buffer->record_disabled))
2276 goto out_nocheck;
2277
2239 if (trace_recursive_lock()) 2278 if (trace_recursive_lock())
2240 goto out_nocheck; 2279 goto out_nocheck;
2241 2280
@@ -2467,11 +2506,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
2467 if (ring_buffer_flags != RB_BUFFERS_ON) 2506 if (ring_buffer_flags != RB_BUFFERS_ON)
2468 return -EBUSY; 2507 return -EBUSY;
2469 2508
2470 if (atomic_read(&buffer->record_disabled))
2471 return -EBUSY;
2472
2473 resched = ftrace_preempt_disable(); 2509 resched = ftrace_preempt_disable();
2474 2510
2511 if (atomic_read(&buffer->record_disabled))
2512 goto out;
2513
2475 cpu = raw_smp_processor_id(); 2514 cpu = raw_smp_processor_id();
2476 2515
2477 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2516 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2539,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2539 * @buffer: The ring buffer to enable writes 2578 * @buffer: The ring buffer to enable writes
2540 * 2579 *
2541 * Note, multiple disables will need the same number of enables 2580 * Note, multiple disables will need the same number of enables
2542 * to truely enable the writing (much like preempt_disable). 2581 * to truly enable the writing (much like preempt_disable).
2543 */ 2582 */
2544void ring_buffer_record_enable(struct ring_buffer *buffer) 2583void ring_buffer_record_enable(struct ring_buffer *buffer)
2545{ 2584{
@@ -2575,7 +2614,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2575 * @cpu: The CPU to enable. 2614 * @cpu: The CPU to enable.
2576 * 2615 *
2577 * Note, multiple disables will need the same number of enables 2616 * Note, multiple disables will need the same number of enables
2578 * to truely enable the writing (much like preempt_disable). 2617 * to truly enable the writing (much like preempt_disable).
2579 */ 2618 */
2580void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2619void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2581{ 2620{
@@ -2716,6 +2755,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2755 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2756 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2757 iter->read_stamp = iter->head_page->page->time_stamp;
2758 iter->cache_reader_page = cpu_buffer->reader_page;
2759 iter->cache_read = cpu_buffer->read;
2719} 2760}
2720 2761
2721/** 2762/**
@@ -2822,6 +2863,7 @@ static struct buffer_page *
2822rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2863rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2823{ 2864{
2824 struct buffer_page *reader = NULL; 2865 struct buffer_page *reader = NULL;
2866 unsigned long overwrite;
2825 unsigned long flags; 2867 unsigned long flags;
2826 int nr_loops = 0; 2868 int nr_loops = 0;
2827 int ret; 2869 int ret;
@@ -2863,6 +2905,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2863 local_set(&cpu_buffer->reader_page->write, 0); 2905 local_set(&cpu_buffer->reader_page->write, 0);
2864 local_set(&cpu_buffer->reader_page->entries, 0); 2906 local_set(&cpu_buffer->reader_page->entries, 0);
2865 local_set(&cpu_buffer->reader_page->page->commit, 0); 2907 local_set(&cpu_buffer->reader_page->page->commit, 0);
2908 cpu_buffer->reader_page->real_end = 0;
2866 2909
2867 spin: 2910 spin:
2868 /* 2911 /*
@@ -2883,6 +2926,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2883 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2926 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2884 2927
2885 /* 2928 /*
2929 * We want to make sure we read the overruns after we set up our
2930 * pointers to the next object. The writer side does a
2931 * cmpxchg to cross pages which acts as the mb on the writer
2932 * side. Note, the reader will constantly fail the swap
2933 * while the writer is updating the pointers, so this
2934 * guarantees that the overwrite recorded here is the one we
2935 * want to compare with the last_overrun.
2936 */
2937 smp_mb();
2938 overwrite = local_read(&(cpu_buffer->overrun));
2939
2940 /*
2886 * Here's the tricky part. 2941 * Here's the tricky part.
2887 * 2942 *
2888 * We need to move the pointer past the header page. 2943 * We need to move the pointer past the header page.
@@ -2913,6 +2968,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 cpu_buffer->reader_page = reader; 2968 cpu_buffer->reader_page = reader;
2914 rb_reset_reader_page(cpu_buffer); 2969 rb_reset_reader_page(cpu_buffer);
2915 2970
2971 if (overwrite != cpu_buffer->last_overrun) {
2972 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2973 cpu_buffer->last_overrun = overwrite;
2974 }
2975
2916 goto again; 2976 goto again;
2917 2977
2918 out: 2978 out:
@@ -2989,8 +3049,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2989 rb_advance_iter(iter); 3049 rb_advance_iter(iter);
2990} 3050}
2991 3051
3052static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3053{
3054 return cpu_buffer->lost_events;
3055}
3056
2992static struct ring_buffer_event * 3057static struct ring_buffer_event *
2993rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3058rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3059 unsigned long *lost_events)
2994{ 3060{
2995 struct ring_buffer_event *event; 3061 struct ring_buffer_event *event;
2996 struct buffer_page *reader; 3062 struct buffer_page *reader;
@@ -3042,6 +3108,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3042 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3108 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3043 cpu_buffer->cpu, ts); 3109 cpu_buffer->cpu, ts);
3044 } 3110 }
3111 if (lost_events)
3112 *lost_events = rb_lost_events(cpu_buffer);
3045 return event; 3113 return event;
3046 3114
3047 default: 3115 default:
@@ -3060,13 +3128,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3128 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3129 int nr_loops = 0;
3062 3130
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3131 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3132 buffer = cpu_buffer->buffer;
3068 3133
3134 /*
3135 * Check if someone performed a consuming read to
3136 * the buffer. A consuming read invalidates the iterator
3137 * and we need to reset the iterator in this case.
3138 */
3139 if (unlikely(iter->cache_read != cpu_buffer->read ||
3140 iter->cache_reader_page != cpu_buffer->reader_page))
3141 rb_iter_reset(iter);
3142
3069 again: 3143 again:
3144 if (ring_buffer_iter_empty(iter))
3145 return NULL;
3146
3070 /* 3147 /*
3071 * We repeat when a timestamp is encountered. 3148 * We repeat when a timestamp is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3149 * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3158,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3081 if (rb_per_cpu_empty(cpu_buffer)) 3158 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3159 return NULL;
3083 3160
3161 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3162 rb_inc_iter(iter);
3163 goto again;
3164 }
3165
3084 event = rb_iter_head_event(iter); 3166 event = rb_iter_head_event(iter);
3085 3167
3086 switch (event->type_len) { 3168 switch (event->type_len) {
@@ -3138,12 +3220,14 @@ static inline int rb_ok_to_lock(void)
3138 * @buffer: The ring buffer to read 3220 * @buffer: The ring buffer to read
3139 * @cpu: The cpu to peak at 3221 * @cpu: The cpu to peak at
3140 * @ts: The timestamp counter of this event. 3222 * @ts: The timestamp counter of this event.
3223 * @lost_events: a variable to store if events were lost (may be NULL)
3141 * 3224 *
3142 * This will return the event that will be read next, but does 3225 * This will return the event that will be read next, but does
3143 * not consume the data. 3226 * not consume the data.
3144 */ 3227 */
3145struct ring_buffer_event * 3228struct ring_buffer_event *
3146ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3229ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3230 unsigned long *lost_events)
3147{ 3231{
3148 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3232 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3149 struct ring_buffer_event *event; 3233 struct ring_buffer_event *event;
@@ -3158,7 +3242,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3158 local_irq_save(flags); 3242 local_irq_save(flags);
3159 if (dolock) 3243 if (dolock)
3160 spin_lock(&cpu_buffer->reader_lock); 3244 spin_lock(&cpu_buffer->reader_lock);
3161 event = rb_buffer_peek(cpu_buffer, ts); 3245 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3162 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3246 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3163 rb_advance_reader(cpu_buffer); 3247 rb_advance_reader(cpu_buffer);
3164 if (dolock) 3248 if (dolock)
@@ -3200,13 +3284,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3200/** 3284/**
3201 * ring_buffer_consume - return an event and consume it 3285 * ring_buffer_consume - return an event and consume it
3202 * @buffer: The ring buffer to get the next event from 3286 * @buffer: The ring buffer to get the next event from
3287 * @cpu: the cpu to read the buffer from
3288 * @ts: a variable to store the timestamp (may be NULL)
3289 * @lost_events: a variable to store if events were lost (may be NULL)
3203 * 3290 *
3204 * Returns the next event in the ring buffer, and that event is consumed. 3291 * Returns the next event in the ring buffer, and that event is consumed.
3205 * Meaning, that sequential reads will keep returning a different event, 3292 * Meaning, that sequential reads will keep returning a different event,
3206 * and eventually empty the ring buffer if the producer is slower. 3293 * and eventually empty the ring buffer if the producer is slower.
3207 */ 3294 */
3208struct ring_buffer_event * 3295struct ring_buffer_event *
3209ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3296ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3297 unsigned long *lost_events)
3210{ 3298{
3211 struct ring_buffer_per_cpu *cpu_buffer; 3299 struct ring_buffer_per_cpu *cpu_buffer;
3212 struct ring_buffer_event *event = NULL; 3300 struct ring_buffer_event *event = NULL;
@@ -3227,9 +3315,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3227 if (dolock) 3315 if (dolock)
3228 spin_lock(&cpu_buffer->reader_lock); 3316 spin_lock(&cpu_buffer->reader_lock);
3229 3317
3230 event = rb_buffer_peek(cpu_buffer, ts); 3318 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3231 if (event) 3319 if (event) {
3320 cpu_buffer->lost_events = 0;
3232 rb_advance_reader(cpu_buffer); 3321 rb_advance_reader(cpu_buffer);
3322 }
3233 3323
3234 if (dolock) 3324 if (dolock)
3235 spin_unlock(&cpu_buffer->reader_lock); 3325 spin_unlock(&cpu_buffer->reader_lock);
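
With the extra out-parameter threaded through rb_buffer_peek(), ring_buffer_peek() and ring_buffer_consume(), a consumer can now learn how many events were overwritten since its previous read instead of silently missing them. A hedged sketch of a caller under the new signatures; the function and message are illustrative:

static void demo_consume(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost;
	u64 ts;

	event = ring_buffer_consume(buffer, cpu, &ts, &lost);
	if (event && lost)
		pr_info("dropped %lu events before this one\n", lost);
}
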
@@ -3246,23 +3336,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3246EXPORT_SYMBOL_GPL(ring_buffer_consume); 3336EXPORT_SYMBOL_GPL(ring_buffer_consume);
3247 3337
3248/** 3338/**
3249 * ring_buffer_read_start - start a non consuming read of the buffer 3339 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3250 * @buffer: The ring buffer to read from 3340 * @buffer: The ring buffer to read from
3251 * @cpu: The cpu buffer to iterate over 3341 * @cpu: The cpu buffer to iterate over
3252 * 3342 *
3253 * This starts up an iteration through the buffer. It also disables 3343 * This performs the initial preparations necessary to iterate
3254 * the recording to the buffer until the reading is finished. 3344 * through the buffer. Memory is allocated, buffer recording
3255 * This prevents the reading from being corrupted. This is not 3345 * is disabled, and the iterator pointer is returned to the caller.
3256 * a consuming read, so a producer is not expected.
3257 * 3346 *
3258 * Must be paired with ring_buffer_finish. 3347 * Disabling buffer recording prevents the reading from being
3348 * corrupted. This is not a consuming read, so a producer is not
3349 * expected.
3350 *
3351 * After a sequence of ring_buffer_read_prepare calls, the user is
3352 * expected to make at least one call to ring_buffer_read_prepare_sync.
3353 * Afterwards, ring_buffer_read_start is invoked to get things going
3354 * for real.
3355 *
3356 * This overall must be paired with ring_buffer_finish.
3259 */ 3357 */
3260struct ring_buffer_iter * 3358struct ring_buffer_iter *
3261ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3359ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3262{ 3360{
3263 struct ring_buffer_per_cpu *cpu_buffer; 3361 struct ring_buffer_per_cpu *cpu_buffer;
3264 struct ring_buffer_iter *iter; 3362 struct ring_buffer_iter *iter;
3265 unsigned long flags;
3266 3363
3267 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3364 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3268 return NULL; 3365 return NULL;
@@ -3276,15 +3373,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3276 iter->cpu_buffer = cpu_buffer; 3373 iter->cpu_buffer = cpu_buffer;
3277 3374
3278 atomic_inc(&cpu_buffer->record_disabled); 3375 atomic_inc(&cpu_buffer->record_disabled);
3376
3377 return iter;
3378}
3379EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3380
3381/**
3382 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3383 *
3384 * All previously invoked ring_buffer_read_prepare calls to prepare
3385 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3386 * calls on those iterators are allowed.
3387 */
3388void
3389ring_buffer_read_prepare_sync(void)
3390{
3279 synchronize_sched(); 3391 synchronize_sched();
3392}
3393EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3394
3395/**
3396 * ring_buffer_read_start - start a non consuming read of the buffer
3397 * @iter: The iterator returned by ring_buffer_read_prepare
3398 *
3399 * This finalizes the startup of an iteration through the buffer.
3400 * The iterator comes from a call to ring_buffer_read_prepare and
3401 * an intervening ring_buffer_read_prepare_sync must have been
3402 * performed.
3403 *
3404 * Must be paired with ring_buffer_finish.
3405 */
3406void
3407ring_buffer_read_start(struct ring_buffer_iter *iter)
3408{
3409 struct ring_buffer_per_cpu *cpu_buffer;
3410 unsigned long flags;
3411
3412 if (!iter)
3413 return;
3414
3415 cpu_buffer = iter->cpu_buffer;
3280 3416
3281 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3417 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3282 arch_spin_lock(&cpu_buffer->lock); 3418 arch_spin_lock(&cpu_buffer->lock);
3283 rb_iter_reset(iter); 3419 rb_iter_reset(iter);
3284 arch_spin_unlock(&cpu_buffer->lock); 3420 arch_spin_unlock(&cpu_buffer->lock);
3285 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3286
3287 return iter;
3288} 3422}
3289EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3423EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3290 3424
@@ -3378,6 +3512,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3378 cpu_buffer->write_stamp = 0; 3512 cpu_buffer->write_stamp = 0;
3379 cpu_buffer->read_stamp = 0; 3513 cpu_buffer->read_stamp = 0;
3380 3514
3515 cpu_buffer->lost_events = 0;
3516 cpu_buffer->last_overrun = 0;
3517
3381 rb_head_page_activate(cpu_buffer); 3518 rb_head_page_activate(cpu_buffer);
3382} 3519}
3383 3520
@@ -3653,6 +3790,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3653 struct ring_buffer_event *event; 3790 struct ring_buffer_event *event;
3654 struct buffer_data_page *bpage; 3791 struct buffer_data_page *bpage;
3655 struct buffer_page *reader; 3792 struct buffer_page *reader;
3793 unsigned long missed_events;
3656 unsigned long flags; 3794 unsigned long flags;
3657 unsigned int commit; 3795 unsigned int commit;
3658 unsigned int read; 3796 unsigned int read;
@@ -3689,6 +3827,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3689 read = reader->read; 3827 read = reader->read;
3690 commit = rb_page_commit(reader); 3828 commit = rb_page_commit(reader);
3691 3829
3830 /* Check if any events were dropped */
3831 missed_events = cpu_buffer->lost_events;
3832
3692 /* 3833 /*
3693 * If this page has been partially read or 3834 * If this page has been partially read or
3694 * if len is not big enough to read the rest of the page or 3835 * if len is not big enough to read the rest of the page or
@@ -3749,9 +3890,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3749 local_set(&reader->entries, 0); 3890 local_set(&reader->entries, 0);
3750 reader->read = 0; 3891 reader->read = 0;
3751 *data_page = bpage; 3892 *data_page = bpage;
3893
3894 /*
 3895 * Use the real_end for the data size.
3896 * This gives us a chance to store the lost events
3897 * on the page.
3898 */
3899 if (reader->real_end)
3900 local_set(&bpage->commit, reader->real_end);
3752 } 3901 }
3753 ret = read; 3902 ret = read;
3754 3903
3904 cpu_buffer->lost_events = 0;
3905
3906 commit = local_read(&bpage->commit);
3907 /*
3908 * Set a flag in the commit field if we lost events
3909 */
3910 if (missed_events) {
3911 /* If there is room at the end of the page to save the
3912 * missed events, then record it there.
3913 */
3914 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3915 memcpy(&bpage->data[commit], &missed_events,
3916 sizeof(missed_events));
3917 local_add(RB_MISSED_STORED, &bpage->commit);
3918 commit += sizeof(missed_events);
3919 }
3920 local_add(RB_MISSED_EVENTS, &bpage->commit);
3921 }
3922
3923 /*
3924 * This page may be off to user land. Zero it out here.
3925 */
3926 if (commit < BUF_PAGE_SIZE)
3927 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3928
3755 out_unlock: 3929 out_unlock:
3756 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3930 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3757 3931
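For consumers of the page returned by ring_buffer_read_page(), the lost-event information now rides in the commit word and, when it fits, at the end of the page data. A hedged sketch of decoding it from inside ring_buffer.c, where struct buffer_data_page is visible (RB_MISSED_EVENTS and RB_MISSED_STORED are defined elsewhere in this file; the 0xfffff size mask is borrowed from the benchmark change below and assumed to cover the data-length bits):

	/* Hypothetical consumer of a page filled by ring_buffer_read_page(). */
	static void report_missed_events(struct buffer_data_page *bpage)
	{
		unsigned long commit = local_read(&bpage->commit);
		unsigned long size = commit & 0xfffff;	/* assumed length mask */
		unsigned long missed = 0;

		if (!(commit & RB_MISSED_EVENTS))
			return;

		/* The count itself is only present when it fit on the page. */
		if (commit & RB_MISSED_STORED)
			memcpy(&missed, &bpage->data[size], sizeof(missed));

		printk(KERN_INFO "lost %lu events (0 means count unknown)\n",
		       missed);
	}
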
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index b2477caf09c2..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -8,6 +8,7 @@
8#include <linux/kthread.h> 8#include <linux/kthread.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <asm/local.h>
11 12
12struct rb_page { 13struct rb_page {
13 u64 ts; 14 u64 ts;
@@ -80,7 +81,7 @@ static enum event_status read_event(int cpu)
80 int *entry; 81 int *entry;
81 u64 ts; 82 u64 ts;
82 83
83 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
84 if (!event) 85 if (!event)
85 return EVENT_DROPPED; 86 return EVENT_DROPPED;
86 87
@@ -112,7 +113,8 @@ static enum event_status read_page(int cpu)
112 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
113 if (ret >= 0) { 114 if (ret >= 0) {
114 rpage = bpage; 115 rpage = bpage;
115 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
116 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
117 119
118 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
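ring_buffer_consume() grows a fourth argument reporting how many events were dropped before the returned one; passing NULL, as the benchmark does above, keeps the old behaviour. A hypothetical consuming reader that does want the count:

	static void drain_cpu(struct ring_buffer *buffer, int cpu)
	{
		struct ring_buffer_event *event;
		unsigned long lost;
		u64 ts;

		while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
			if (lost)
				printk(KERN_WARNING "cpu %d: %lu events lost before ts %llu\n",
				       cpu, lost, (unsigned long long)ts);
			/* process ring_buffer_event_data(event) here */
		}
	}
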
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..086d36316805 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,10 +32,11 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
36#include <linux/slab.h>
35#include <linux/ctype.h> 37#include <linux/ctype.h>
36#include <linux/init.h> 38#include <linux/init.h>
37#include <linux/poll.h> 39#include <linux/poll.h>
38#include <linux/gfp.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -91,20 +92,17 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled);
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(ftrace_cpu_disabled);
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(ftrace_cpu_disabled);
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -119,9 +117,12 @@ static cpumask_var_t tracing_reader_cpumask;
119 * 117 *
120 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
121 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
122 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
123 */ 123 */
124int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
125 126
126static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
127 128
@@ -141,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
141 142
142static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
143{ 144{
144 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
145 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
146} 156}
147__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
148 158
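With the parser above, the boot parameter now takes an optional mode. Illustrative kernel command-line fragments (per the comment rewritten earlier in this file, the numeric values 1 and 2 can likewise be written to /proc/sys/kernel/ftrace_dump_on_oops):

	ftrace_dump_on_oops            /* dump the buffers of all CPUs  (DUMP_ALL)  */
	ftrace_dump_on_oops=orig_cpu   /* dump only the oopsing CPU     (DUMP_ORIG) */
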
@@ -243,12 +253,91 @@ static struct tracer *current_trace __read_mostly;
243 253
244/* 254/*
245 * trace_types_lock is used to protect the trace_types list. 255 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 256 */
250static DEFINE_MUTEX(trace_types_lock); 257static DEFINE_MUTEX(trace_types_lock);
251 258
259/*
 260 * Serialize access to the ring buffer.
 261 *
 262 * The ring buffer serializes readers, but that is only low-level protection.
 263 * The validity of events (returned by ring_buffer_peek() etc.)
 264 * is not protected by the ring buffer.
 265 *
 266 * The content of events may become garbage if we allow another process to
 267 * consume these events concurrently:
 268 * A) the page holding the consumed events may become a normal page
 269 * (not a reader page) in the ring buffer, and this page will be rewritten
 270 * by the event producer.
 271 * B) The page holding the consumed events may become a page used for
 272 * splice_read, and this page will be returned to the system.
 273 *
 274 * These primitives allow multiple processes to access different per-cpu
 275 * ring buffers concurrently.
 276 *
 277 * These primitives don't distinguish read-only and read-consume access.
 278 * Multiple read-only accesses are also serialized.
279 */
280
281#ifdef CONFIG_SMP
282static DECLARE_RWSEM(all_cpu_access_lock);
283static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
284
285static inline void trace_access_lock(int cpu)
286{
287 if (cpu == TRACE_PIPE_ALL_CPU) {
288 /* gain it for accessing the whole ring buffer. */
289 down_write(&all_cpu_access_lock);
290 } else {
291 /* gain it for accessing a cpu ring buffer. */
292
293 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
294 down_read(&all_cpu_access_lock);
295
296 /* Secondly block other access to this @cpu ring buffer. */
297 mutex_lock(&per_cpu(cpu_access_lock, cpu));
298 }
299}
300
301static inline void trace_access_unlock(int cpu)
302{
303 if (cpu == TRACE_PIPE_ALL_CPU) {
304 up_write(&all_cpu_access_lock);
305 } else {
306 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
307 up_read(&all_cpu_access_lock);
308 }
309}
310
311static inline void trace_access_lock_init(void)
312{
313 int cpu;
314
315 for_each_possible_cpu(cpu)
316 mutex_init(&per_cpu(cpu_access_lock, cpu));
317}
318
319#else
320
321static DEFINE_MUTEX(access_lock);
322
323static inline void trace_access_lock(int cpu)
324{
325 (void)cpu;
326 mutex_lock(&access_lock);
327}
328
329static inline void trace_access_unlock(int cpu)
330{
331 (void)cpu;
332 mutex_unlock(&access_lock);
333}
334
335static inline void trace_access_lock_init(void)
336{
337}
338
339#endif
340
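A short sketch of how the primitives above nest (hypothetical callers; the real users are the trace_pipe read and splice paths patched later in this file). Readers of different CPUs proceed in parallel, each holding the rwsem shared plus its own per-cpu mutex, while a TRACE_PIPE_ALL_CPU reader takes the rwsem exclusively and excludes them all:

	static void read_one_cpu_buffer(int cpu)
	{
		trace_access_lock(cpu);		/* down_read() + this cpu's mutex */
		/* ... peek/consume events of this cpu's buffer only ... */
		trace_access_unlock(cpu);
	}

	static void read_all_cpu_buffers(void)
	{
		trace_access_lock(TRACE_PIPE_ALL_CPU);	/* down_write(): exclusive */
		/* ... walk every per-cpu buffer ... */
		trace_access_unlock(TRACE_PIPE_ALL_CPU);
	}
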
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 341/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 342static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 343
@@ -297,6 +386,21 @@ static int __init set_buf_size(char *str)
297} 386}
298__setup("trace_buf_size=", set_buf_size); 387__setup("trace_buf_size=", set_buf_size);
299 388
389static int __init set_tracing_thresh(char *str)
390{
391 unsigned long threshhold;
392 int ret;
393
394 if (!str)
395 return 0;
396 ret = strict_strtoul(str, 0, &threshhold);
397 if (ret < 0)
398 return 0;
399 tracing_thresh = threshhold * 1000;
400 return 1;
401}
402__setup("tracing_thresh=", set_tracing_thresh);
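The threshold is given in microseconds on the command line but stored in nanoseconds, hence the multiplication by 1000. An illustrative boot-line fragment:

	tracing_thresh=100	/* only record latencies longer than 100 usecs (stored as 100000 ns) */
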
403
300unsigned long nsecs_to_usecs(unsigned long nsecs) 404unsigned long nsecs_to_usecs(unsigned long nsecs)
301{ 405{
302 return nsecs / 1000; 406 return nsecs / 1000;
@@ -502,9 +606,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
502static arch_spinlock_t ftrace_max_lock = 606static arch_spinlock_t ftrace_max_lock =
503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 607 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
504 608
609unsigned long __read_mostly tracing_thresh;
610
505#ifdef CONFIG_TRACER_MAX_TRACE 611#ifdef CONFIG_TRACER_MAX_TRACE
506unsigned long __read_mostly tracing_max_latency; 612unsigned long __read_mostly tracing_max_latency;
507unsigned long __read_mostly tracing_thresh;
508 613
509/* 614/*
510 * Copy the new maximum trace into the separate maximum-trace 615 * Copy the new maximum trace into the separate maximum-trace
@@ -515,7 +620,7 @@ static void
515__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 620__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
516{ 621{
517 struct trace_array_cpu *data = tr->data[cpu]; 622 struct trace_array_cpu *data = tr->data[cpu];
518 struct trace_array_cpu *max_data = tr->data[cpu]; 623 struct trace_array_cpu *max_data;
519 624
520 max_tr.cpu = cpu; 625 max_tr.cpu = cpu;
521 max_tr.time_start = data->preempt_timestamp; 626 max_tr.time_start = data->preempt_timestamp;
@@ -525,7 +630,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
525 max_data->critical_start = data->critical_start; 630 max_data->critical_start = data->critical_start;
526 max_data->critical_end = data->critical_end; 631 max_data->critical_end = data->critical_end;
527 632
528 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 633 memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
529 max_data->pid = tsk->pid; 634 max_data->pid = tsk->pid;
530 max_data->uid = task_uid(tsk); 635 max_data->uid = task_uid(tsk);
531 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 636 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -747,10 +852,10 @@ out:
747 mutex_unlock(&trace_types_lock); 852 mutex_unlock(&trace_types_lock);
748} 853}
749 854
750static void __tracing_reset(struct trace_array *tr, int cpu) 855static void __tracing_reset(struct ring_buffer *buffer, int cpu)
751{ 856{
752 ftrace_disable_cpu(); 857 ftrace_disable_cpu();
753 ring_buffer_reset_cpu(tr->buffer, cpu); 858 ring_buffer_reset_cpu(buffer, cpu);
754 ftrace_enable_cpu(); 859 ftrace_enable_cpu();
755} 860}
756 861
@@ -762,7 +867,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
762 867
763 /* Make sure all commits have finished */ 868 /* Make sure all commits have finished */
764 synchronize_sched(); 869 synchronize_sched();
765 __tracing_reset(tr, cpu); 870 __tracing_reset(buffer, cpu);
766 871
767 ring_buffer_record_enable(buffer); 872 ring_buffer_record_enable(buffer);
768} 873}
@@ -780,7 +885,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
780 tr->time_start = ftrace_now(tr->cpu); 885 tr->time_start = ftrace_now(tr->cpu);
781 886
782 for_each_online_cpu(cpu) 887 for_each_online_cpu(cpu)
783 __tracing_reset(tr, cpu); 888 __tracing_reset(buffer, cpu);
784 889
785 ring_buffer_record_enable(buffer); 890 ring_buffer_record_enable(buffer);
786} 891}
@@ -857,6 +962,8 @@ void tracing_start(void)
857 goto out; 962 goto out;
858 } 963 }
859 964
965 /* Prevent the buffers from switching */
966 arch_spin_lock(&ftrace_max_lock);
860 967
861 buffer = global_trace.buffer; 968 buffer = global_trace.buffer;
862 if (buffer) 969 if (buffer)
@@ -866,6 +973,8 @@ void tracing_start(void)
866 if (buffer) 973 if (buffer)
867 ring_buffer_record_enable(buffer); 974 ring_buffer_record_enable(buffer);
868 975
976 arch_spin_unlock(&ftrace_max_lock);
977
869 ftrace_start(); 978 ftrace_start();
870 out: 979 out:
871 spin_unlock_irqrestore(&tracing_start_lock, flags); 980 spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -887,6 +996,9 @@ void tracing_stop(void)
887 if (trace_stop_count++) 996 if (trace_stop_count++)
888 goto out; 997 goto out;
889 998
999 /* Prevent the buffers from switching */
1000 arch_spin_lock(&ftrace_max_lock);
1001
890 buffer = global_trace.buffer; 1002 buffer = global_trace.buffer;
891 if (buffer) 1003 if (buffer)
892 ring_buffer_record_disable(buffer); 1004 ring_buffer_record_disable(buffer);
@@ -895,6 +1007,8 @@ void tracing_stop(void)
895 if (buffer) 1007 if (buffer)
896 ring_buffer_record_disable(buffer); 1008 ring_buffer_record_disable(buffer);
897 1009
1010 arch_spin_unlock(&ftrace_max_lock);
1011
898 out: 1012 out:
899 spin_unlock_irqrestore(&tracing_start_lock, flags); 1013 spin_unlock_irqrestore(&tracing_start_lock, flags);
900} 1014}
@@ -951,6 +1065,11 @@ void trace_find_cmdline(int pid, char comm[])
951 return; 1065 return;
952 } 1066 }
953 1067
1068 if (WARN_ON_ONCE(pid < 0)) {
1069 strcpy(comm, "<XXX>");
1070 return;
1071 }
1072
954 if (pid > PID_MAX_DEFAULT) { 1073 if (pid > PID_MAX_DEFAULT) {
955 strcpy(comm, "<...>"); 1074 strcpy(comm, "<...>");
956 return; 1075 return;
@@ -1084,7 +1203,7 @@ trace_function(struct trace_array *tr,
1084 struct ftrace_entry *entry; 1203 struct ftrace_entry *entry;
1085 1204
1086 /* If we are reading the ring buffer, don't trace */ 1205 /* If we are reading the ring buffer, don't trace */
1087 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 1206 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
1088 return; 1207 return;
1089 1208
1090 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1209 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1177,6 +1296,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1177 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1296 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1178 return; 1297 return;
1179 1298
1299 /*
 1300 * NMIs cannot handle page faults, even with fixups.
 1301 * Saving the user stack can (and often does) fault.
1302 */
1303 if (unlikely(in_nmi()))
1304 return;
1305
1180 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, 1306 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1181 sizeof(*entry), flags, pc); 1307 sizeof(*entry), flags, pc);
1182 if (!event) 1308 if (!event)
@@ -1315,8 +1441,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1315 entry->fmt = fmt; 1441 entry->fmt = fmt;
1316 1442
1317 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1443 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1318 if (!filter_check_discard(call, entry, buffer, event)) 1444 if (!filter_check_discard(call, entry, buffer, event)) {
1319 ring_buffer_unlock_commit(buffer, event); 1445 ring_buffer_unlock_commit(buffer, event);
1446 ftrace_trace_stack(buffer, flags, 6, pc);
1447 }
1320 1448
1321out_unlock: 1449out_unlock:
1322 arch_spin_unlock(&trace_buf_lock); 1450 arch_spin_unlock(&trace_buf_lock);
@@ -1389,8 +1517,10 @@ int trace_array_vprintk(struct trace_array *tr,
1389 1517
1390 memcpy(&entry->buf, trace_buf, len); 1518 memcpy(&entry->buf, trace_buf, len);
1391 entry->buf[len] = '\0'; 1519 entry->buf[len] = '\0';
1392 if (!filter_check_discard(call, entry, buffer, event)) 1520 if (!filter_check_discard(call, entry, buffer, event)) {
1393 ring_buffer_unlock_commit(buffer, event); 1521 ring_buffer_unlock_commit(buffer, event);
1522 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1523 }
1394 1524
1395 out_unlock: 1525 out_unlock:
1396 arch_spin_unlock(&trace_buf_lock); 1526 arch_spin_unlock(&trace_buf_lock);
@@ -1427,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1427} 1557}
1428 1558
1429static struct trace_entry * 1559static struct trace_entry *
1430peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1431{ 1562{
1432 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1433 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1438,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1438 if (buf_iter) 1569 if (buf_iter)
1439 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1440 else 1571 else
1441 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1442 1574
1443 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1444 1576
@@ -1446,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1446} 1578}
1447 1579
1448static struct trace_entry * 1580static struct trace_entry *
1449__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1450{ 1583{
1451 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1452 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1453 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1454 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1455 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1462,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1462 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1463 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1464 return NULL; 1598 return NULL;
1465 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1466 if (ent_cpu) 1600 if (ent_cpu)
1467 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1468 1602
@@ -1474,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1474 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1475 continue; 1609 continue;
1476 1610
1477 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1478 1612
1479 /* 1613 /*
1480 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1483,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1483 next = ent; 1617 next = ent;
1484 next_cpu = cpu; 1618 next_cpu = cpu;
1485 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1486 } 1621 }
1487 } 1622 }
1488 1623
@@ -1492,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1492 if (ent_ts) 1627 if (ent_ts)
1493 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1494 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1495 return next; 1633 return next;
1496} 1634}
1497 1635
@@ -1499,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1499struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1500 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1501{ 1639{
1502 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1503} 1641}
1504 1642
1505/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1506static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1507{ 1645{
1508 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1509 1648
1510 if (iter->ent) 1649 if (iter->ent)
1511 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1517,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1517{ 1656{
1518 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1519 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1520 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1521 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1522} 1662}
1523 1663
@@ -1580,12 +1720,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1580} 1720}
1581 1721
1582/* 1722/*
1583 * No necessary locking here. The worst thing which can
1584 * happen is loosing events consumed at the same time
1585 * by a trace_pipe reader.
1586 * Other than that, we don't risk to crash the ring buffer
1587 * because it serializes the readers.
1588 *
1589 * The current tracer is copied to avoid a global locking 1723 * The current tracer is copied to avoid a global locking
1590 * all around. 1724 * all around.
1591 */ 1725 */
@@ -1623,6 +1757,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1623 1757
1624 ftrace_enable_cpu(); 1758 ftrace_enable_cpu();
1625 1759
1760 iter->leftover = 0;
1626 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1761 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1627 ; 1762 ;
1628 1763
@@ -1640,12 +1775,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1640 } 1775 }
1641 1776
1642 trace_event_read_lock(); 1777 trace_event_read_lock();
1778 trace_access_lock(cpu_file);
1643 return p; 1779 return p;
1644} 1780}
1645 1781
1646static void s_stop(struct seq_file *m, void *p) 1782static void s_stop(struct seq_file *m, void *p)
1647{ 1783{
1784 struct trace_iterator *iter = m->private;
1785
1648 atomic_dec(&trace_record_cmdline_disabled); 1786 atomic_dec(&trace_record_cmdline_disabled);
1787 trace_access_unlock(iter->cpu_file);
1649 trace_event_read_unlock(); 1788 trace_event_read_unlock();
1650} 1789}
1651 1790
@@ -1669,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1669} 1808}
1670 1809
1671 1810
1672static void 1811void
1673print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1674{ 1813{
1675 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1797,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1797 } 1936 }
1798 1937
1799 if (event) 1938 if (event)
1800 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1801 1940
1802 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1803 goto partial; 1942 goto partial;
@@ -1823,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1823 1962
1824 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1825 if (event) 1964 if (event)
1826 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1827 1966
1828 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1829 goto partial; 1968 goto partial;
@@ -1850,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1850 1989
1851 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1852 if (event) { 1991 if (event) {
1853 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1854 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1855 return ret; 1994 return ret;
1856 } 1995 }
@@ -1875,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1875 } 2014 }
1876 2015
1877 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
1878 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
1879} 2019}
1880 2020
1881static int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
1882{ 2022{
1883 int cpu; 2023 int cpu;
1884 2024
@@ -1913,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1913{ 2053{
1914 enum print_line_t ret; 2054 enum print_line_t ret;
1915 2055
2056 if (iter->lost_events)
2057 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2058 iter->cpu, iter->lost_events);
2059
1916 if (iter->trace && iter->trace->print_line) { 2060 if (iter->trace && iter->trace->print_line) {
1917 ret = iter->trace->print_line(iter); 2061 ret = iter->trace->print_line(iter);
1918 if (ret != TRACE_TYPE_UNHANDLED) 2062 if (ret != TRACE_TYPE_UNHANDLED)
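When iter->lost_events is non-zero, the read path now emits a marker built from the format string above before printing the next event. An illustrative line (the count is made up) as it would appear in the trace output:

	CPU:1 [LOST 823 EVENTS]
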
@@ -1941,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1941 return print_trace_fmt(iter); 2085 return print_trace_fmt(iter);
1942} 2086}
1943 2087
2088void trace_default_header(struct seq_file *m)
2089{
2090 struct trace_iterator *iter = m->private;
2091
2092 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2093 /* print nothing if the buffers are empty */
2094 if (trace_empty(iter))
2095 return;
2096 print_trace_header(m, iter);
2097 if (!(trace_flags & TRACE_ITER_VERBOSE))
2098 print_lat_help_header(m);
2099 } else {
2100 if (!(trace_flags & TRACE_ITER_VERBOSE))
2101 print_func_help_header(m);
2102 }
2103}
2104
1944static int s_show(struct seq_file *m, void *v) 2105static int s_show(struct seq_file *m, void *v)
1945{ 2106{
1946 struct trace_iterator *iter = v; 2107 struct trace_iterator *iter = v;
@@ -1953,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
1953 } 2114 }
1954 if (iter->trace && iter->trace->print_header) 2115 if (iter->trace && iter->trace->print_header)
1955 iter->trace->print_header(m); 2116 iter->trace->print_header(m);
1956 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2117 else
1957 /* print nothing if the buffers are empty */ 2118 trace_default_header(m);
1958 if (trace_empty(iter)) 2119
1959 return 0;
1960 print_trace_header(m, iter);
1961 if (!(trace_flags & TRACE_ITER_VERBOSE))
1962 print_lat_help_header(m);
1963 } else {
1964 if (!(trace_flags & TRACE_ITER_VERBOSE))
1965 print_func_help_header(m);
1966 }
1967 } else if (iter->leftover) { 2120 } else if (iter->leftover) {
1968 /* 2121 /*
1969 * If we filled the seq_file buffer earlier, we 2122 * If we filled the seq_file buffer earlier, we
@@ -2049,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
2049 2202
2050 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2203 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2051 for_each_tracing_cpu(cpu) { 2204 for_each_tracing_cpu(cpu) {
2052
2053 iter->buffer_iter[cpu] = 2205 iter->buffer_iter[cpu] =
2054 ring_buffer_read_start(iter->tr->buffer, cpu); 2206 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2207 }
2208 ring_buffer_read_prepare_sync();
2209 for_each_tracing_cpu(cpu) {
2210 ring_buffer_read_start(iter->buffer_iter[cpu]);
2055 tracing_iter_reset(iter, cpu); 2211 tracing_iter_reset(iter, cpu);
2056 } 2212 }
2057 } else { 2213 } else {
2058 cpu = iter->cpu_file; 2214 cpu = iter->cpu_file;
2059 iter->buffer_iter[cpu] = 2215 iter->buffer_iter[cpu] =
2060 ring_buffer_read_start(iter->tr->buffer, cpu); 2216 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2217 ring_buffer_read_prepare_sync();
2218 ring_buffer_read_start(iter->buffer_iter[cpu]);
2061 tracing_iter_reset(iter, cpu); 2219 tracing_iter_reset(iter, cpu);
2062 } 2220 }
2063 2221
@@ -2836,22 +2994,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2836 2994
2837 mutex_lock(&trace_types_lock); 2995 mutex_lock(&trace_types_lock);
2838 2996
2839 /* We only allow one reader per cpu */
2840 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2841 if (!cpumask_empty(tracing_reader_cpumask)) {
2842 ret = -EBUSY;
2843 goto out;
2844 }
2845 cpumask_setall(tracing_reader_cpumask);
2846 } else {
2847 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2848 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2849 else {
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853 }
2854
2855 /* create a buffer to store the information to pass to userspace */ 2997 /* create a buffer to store the information to pass to userspace */
2856 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2998 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2857 if (!iter) { 2999 if (!iter) {
@@ -2907,12 +3049,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2907 3049
2908 mutex_lock(&trace_types_lock); 3050 mutex_lock(&trace_types_lock);
2909 3051
2910 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2911 cpumask_clear(tracing_reader_cpumask);
2912 else
2913 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2914
2915
2916 if (iter->trace->pipe_close) 3052 if (iter->trace->pipe_close)
2917 iter->trace->pipe_close(iter); 3053 iter->trace->pipe_close(iter);
2918 3054
@@ -3074,6 +3210,7 @@ waitagain:
3074 iter->pos = -1; 3210 iter->pos = -1;
3075 3211
3076 trace_event_read_lock(); 3212 trace_event_read_lock();
3213 trace_access_lock(iter->cpu_file);
3077 while (find_next_entry_inc(iter) != NULL) { 3214 while (find_next_entry_inc(iter) != NULL) {
3078 enum print_line_t ret; 3215 enum print_line_t ret;
3079 int len = iter->seq.len; 3216 int len = iter->seq.len;
@@ -3090,6 +3227,7 @@ waitagain:
3090 if (iter->seq.len >= cnt) 3227 if (iter->seq.len >= cnt)
3091 break; 3228 break;
3092 } 3229 }
3230 trace_access_unlock(iter->cpu_file);
3093 trace_event_read_unlock(); 3231 trace_event_read_unlock();
3094 3232
3095 /* Now copy what we have to the user */ 3233 /* Now copy what we have to the user */
@@ -3172,12 +3310,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3172 size_t len, 3310 size_t len,
3173 unsigned int flags) 3311 unsigned int flags)
3174{ 3312{
3175 struct page *pages[PIPE_BUFFERS]; 3313 struct page *pages_def[PIPE_DEF_BUFFERS];
3176 struct partial_page partial[PIPE_BUFFERS]; 3314 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3177 struct trace_iterator *iter = filp->private_data; 3315 struct trace_iterator *iter = filp->private_data;
3178 struct splice_pipe_desc spd = { 3316 struct splice_pipe_desc spd = {
3179 .pages = pages, 3317 .pages = pages_def,
3180 .partial = partial, 3318 .partial = partial_def,
3181 .nr_pages = 0, /* This gets updated below. */ 3319 .nr_pages = 0, /* This gets updated below. */
3182 .flags = flags, 3320 .flags = flags,
3183 .ops = &tracing_pipe_buf_ops, 3321 .ops = &tracing_pipe_buf_ops,
@@ -3188,6 +3326,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3188 size_t rem; 3326 size_t rem;
3189 unsigned int i; 3327 unsigned int i;
3190 3328
3329 if (splice_grow_spd(pipe, &spd))
3330 return -ENOMEM;
3331
3191 /* copy the tracer to avoid using a global lock all around */ 3332 /* copy the tracer to avoid using a global lock all around */
3192 mutex_lock(&trace_types_lock); 3333 mutex_lock(&trace_types_lock);
3193 if (unlikely(old_tracer != current_trace && current_trace)) { 3334 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3215,40 +3356,44 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3215 } 3356 }
3216 3357
3217 trace_event_read_lock(); 3358 trace_event_read_lock();
3359 trace_access_lock(iter->cpu_file);
3218 3360
3219 /* Fill as many pages as possible. */ 3361 /* Fill as many pages as possible. */
3220 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3362 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3221 pages[i] = alloc_page(GFP_KERNEL); 3363 spd.pages[i] = alloc_page(GFP_KERNEL);
3222 if (!pages[i]) 3364 if (!spd.pages[i])
3223 break; 3365 break;
3224 3366
3225 rem = tracing_fill_pipe_page(rem, iter); 3367 rem = tracing_fill_pipe_page(rem, iter);
3226 3368
3227 /* Copy the data into the page, so we can start over. */ 3369 /* Copy the data into the page, so we can start over. */
3228 ret = trace_seq_to_buffer(&iter->seq, 3370 ret = trace_seq_to_buffer(&iter->seq,
3229 page_address(pages[i]), 3371 page_address(spd.pages[i]),
3230 iter->seq.len); 3372 iter->seq.len);
3231 if (ret < 0) { 3373 if (ret < 0) {
3232 __free_page(pages[i]); 3374 __free_page(spd.pages[i]);
3233 break; 3375 break;
3234 } 3376 }
3235 partial[i].offset = 0; 3377 spd.partial[i].offset = 0;
3236 partial[i].len = iter->seq.len; 3378 spd.partial[i].len = iter->seq.len;
3237 3379
3238 trace_seq_init(&iter->seq); 3380 trace_seq_init(&iter->seq);
3239 } 3381 }
3240 3382
3383 trace_access_unlock(iter->cpu_file);
3241 trace_event_read_unlock(); 3384 trace_event_read_unlock();
3242 mutex_unlock(&iter->mutex); 3385 mutex_unlock(&iter->mutex);
3243 3386
3244 spd.nr_pages = i; 3387 spd.nr_pages = i;
3245 3388
3246 return splice_to_pipe(pipe, &spd); 3389 ret = splice_to_pipe(pipe, &spd);
3390out:
3391 splice_shrink_spd(pipe, &spd);
3392 return ret;
3247 3393
3248out_err: 3394out_err:
3249 mutex_unlock(&iter->mutex); 3395 mutex_unlock(&iter->mutex);
3250 3396 goto out;
3251 return ret;
3252} 3397}
3253 3398
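Both splice paths in this patch move from fixed PIPE_BUFFERS-sized on-stack arrays to PIPE_DEF_BUFFERS defaults that splice_grow_spd() enlarges to the pipe's actual buffer count. A condensed, hypothetical skeleton of the pattern (it borrows tracing_pipe_buf_ops from this file; the data copying is elided):

	static ssize_t example_splice_read(struct pipe_inode_info *pipe,
					   size_t len, unsigned int flags)
	{
		struct page *pages_def[PIPE_DEF_BUFFERS];
		struct partial_page partial_def[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages		= pages_def,
			.partial	= partial_def,
			.flags		= flags,
			.ops		= &tracing_pipe_buf_ops,
		};
		unsigned int nr = min_t(size_t, len >> PAGE_SHIFT, pipe->buffers);
		unsigned int i;
		ssize_t ret;

		if (splice_grow_spd(pipe, &spd))	/* may swap in larger arrays */
			return -ENOMEM;

		for (i = 0; i < nr; i++) {
			spd.pages[i] = alloc_page(GFP_KERNEL);
			if (!spd.pages[i])
				break;
			spd.partial[i].offset = 0;
			spd.partial[i].len = PAGE_SIZE;
			/* copy one page of data into spd.pages[i] here */
		}
		spd.nr_pages = i;

		ret = splice_to_pipe(pipe, &spd);
		splice_shrink_spd(pipe, &spd);		/* free any grown arrays */
		return ret;
	}
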
3254static ssize_t 3399static ssize_t
@@ -3521,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3521 size_t count, loff_t *ppos) 3666 size_t count, loff_t *ppos)
3522{ 3667{
3523 struct ftrace_buffer_info *info = filp->private_data; 3668 struct ftrace_buffer_info *info = filp->private_data;
3524 unsigned int pos;
3525 ssize_t ret; 3669 ssize_t ret;
3526 size_t size; 3670 size_t size;
3527 3671
@@ -3539,18 +3683,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3539 3683
3540 info->read = 0; 3684 info->read = 0;
3541 3685
3686 trace_access_lock(info->cpu);
3542 ret = ring_buffer_read_page(info->tr->buffer, 3687 ret = ring_buffer_read_page(info->tr->buffer,
3543 &info->spare, 3688 &info->spare,
3544 count, 3689 count,
3545 info->cpu, 0); 3690 info->cpu, 0);
3691 trace_access_unlock(info->cpu);
3546 if (ret < 0) 3692 if (ret < 0)
3547 return 0; 3693 return 0;
3548 3694
3549 pos = ring_buffer_page_len(info->spare);
3550
3551 if (pos < PAGE_SIZE)
3552 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3553
3554read: 3695read:
3555 size = PAGE_SIZE - info->read; 3696 size = PAGE_SIZE - info->read;
3556 if (size > count) 3697 if (size > count)
@@ -3645,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3645 unsigned int flags) 3786 unsigned int flags)
3646{ 3787{
3647 struct ftrace_buffer_info *info = file->private_data; 3788 struct ftrace_buffer_info *info = file->private_data;
3648 struct partial_page partial[PIPE_BUFFERS]; 3789 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3649 struct page *pages[PIPE_BUFFERS]; 3790 struct page *pages_def[PIPE_DEF_BUFFERS];
3650 struct splice_pipe_desc spd = { 3791 struct splice_pipe_desc spd = {
3651 .pages = pages, 3792 .pages = pages_def,
3652 .partial = partial, 3793 .partial = partial_def,
3653 .flags = flags, 3794 .flags = flags,
3654 .ops = &buffer_pipe_buf_ops, 3795 .ops = &buffer_pipe_buf_ops,
3655 .spd_release = buffer_spd_release, 3796 .spd_release = buffer_spd_release,
@@ -3658,21 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3658 int entries, size, i; 3799 int entries, size, i;
3659 size_t ret; 3800 size_t ret;
3660 3801
3802 if (splice_grow_spd(pipe, &spd))
3803 return -ENOMEM;
3804
3661 if (*ppos & (PAGE_SIZE - 1)) { 3805 if (*ppos & (PAGE_SIZE - 1)) {
3662 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3806 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3663 return -EINVAL; 3807 ret = -EINVAL;
3808 goto out;
3664 } 3809 }
3665 3810
3666 if (len & (PAGE_SIZE - 1)) { 3811 if (len & (PAGE_SIZE - 1)) {
3667 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3812 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3668 if (len < PAGE_SIZE) 3813 if (len < PAGE_SIZE) {
3669 return -EINVAL; 3814 ret = -EINVAL;
3815 goto out;
3816 }
3670 len &= PAGE_MASK; 3817 len &= PAGE_MASK;
3671 } 3818 }
3672 3819
3820 trace_access_lock(info->cpu);
3673 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3821 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3674 3822
3675 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3823 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3676 struct page *page; 3824 struct page *page;
3677 int r; 3825 int r;
3678 3826
@@ -3717,6 +3865,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3717 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3865 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3718 } 3866 }
3719 3867
3868 trace_access_unlock(info->cpu);
3720 spd.nr_pages = i; 3869 spd.nr_pages = i;
3721 3870
3722 /* did we read anything? */ 3871 /* did we read anything? */
@@ -3726,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3726 else 3875 else
3727 ret = 0; 3876 ret = 0;
3728 /* TODO: block */ 3877 /* TODO: block */
3729 return ret; 3878 goto out;
3730 } 3879 }
3731 3880
3732 ret = splice_to_pipe(pipe, &spd); 3881 ret = splice_to_pipe(pipe, &spd);
3733 3882 splice_shrink_spd(pipe, &spd);
3883out:
3734 return ret; 3884 return ret;
3735} 3885}
3736 3886
@@ -4153,6 +4303,8 @@ static __init int tracer_init_debugfs(void)
4153 struct dentry *d_tracer; 4303 struct dentry *d_tracer;
4154 int cpu; 4304 int cpu;
4155 4305
4306 trace_access_lock_init();
4307
4156 d_tracer = tracing_init_dentry(); 4308 d_tracer = tracing_init_dentry();
4157 4309
4158 trace_create_file("tracing_enabled", 0644, d_tracer, 4310 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4176,10 +4328,10 @@ static __init int tracer_init_debugfs(void)
4176#ifdef CONFIG_TRACER_MAX_TRACE 4328#ifdef CONFIG_TRACER_MAX_TRACE
4177 trace_create_file("tracing_max_latency", 0644, d_tracer, 4329 trace_create_file("tracing_max_latency", 0644, d_tracer,
4178 &tracing_max_latency, &tracing_max_lat_fops); 4330 &tracing_max_latency, &tracing_max_lat_fops);
4331#endif
4179 4332
4180 trace_create_file("tracing_thresh", 0644, d_tracer, 4333 trace_create_file("tracing_thresh", 0644, d_tracer,
4181 &tracing_thresh, &tracing_max_lat_fops); 4334 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4183 4335
4184 trace_create_file("README", 0444, d_tracer, 4336 trace_create_file("README", 0444, d_tracer,
4185 NULL, &tracing_readme_fops); 4337 NULL, &tracing_readme_fops);
@@ -4219,7 +4371,7 @@ static int trace_panic_handler(struct notifier_block *this,
4219 unsigned long event, void *unused) 4371 unsigned long event, void *unused)
4220{ 4372{
4221 if (ftrace_dump_on_oops) 4373 if (ftrace_dump_on_oops)
4222 ftrace_dump(); 4374 ftrace_dump(ftrace_dump_on_oops);
4223 return NOTIFY_OK; 4375 return NOTIFY_OK;
4224} 4376}
4225 4377
@@ -4236,7 +4388,7 @@ static int trace_die_handler(struct notifier_block *self,
4236 switch (val) { 4388 switch (val) {
4237 case DIE_OOPS: 4389 case DIE_OOPS:
4238 if (ftrace_dump_on_oops) 4390 if (ftrace_dump_on_oops)
4239 ftrace_dump(); 4391 ftrace_dump(ftrace_dump_on_oops);
4240 break; 4392 break;
4241 default: 4393 default:
4242 break; 4394 break;
@@ -4277,7 +4429,8 @@ trace_printk_seq(struct trace_seq *s)
4277 trace_seq_init(s); 4429 trace_seq_init(s);
4278} 4430}
4279 4431
4280static void __ftrace_dump(bool disable_tracing) 4432static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4281{ 4434{
4282 static arch_spinlock_t ftrace_dump_lock = 4435 static arch_spinlock_t ftrace_dump_lock =
4283 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4436 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4310,12 +4463,25 @@ static void __ftrace_dump(bool disable_tracing)
4310 /* don't look at user memory in panic mode */ 4463 /* don't look at user memory in panic mode */
4311 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4464 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4312 4465
4313 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4314
4315 /* Simulate the iterator */ 4466 /* Simulate the iterator */
4316 iter.tr = &global_trace; 4467 iter.tr = &global_trace;
4317 iter.trace = current_trace; 4468 iter.trace = current_trace;
4318 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4469
4470 switch (oops_dump_mode) {
4471 case DUMP_ALL:
4472 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4473 break;
4474 case DUMP_ORIG:
4475 iter.cpu_file = raw_smp_processor_id();
4476 break;
4477 case DUMP_NONE:
4478 goto out_enable;
4479 default:
4480 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4481 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4482 }
4483
4484 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4319 4485
4320 /* 4486 /*
4321 * We need to stop all tracing on all CPUS to read the 4487 * We need to stop all tracing on all CPUS to read the
@@ -4354,6 +4520,7 @@ static void __ftrace_dump(bool disable_tracing)
4354 else 4520 else
4355 printk(KERN_TRACE "---------------------------------\n"); 4521 printk(KERN_TRACE "---------------------------------\n");
4356 4522
4523 out_enable:
4357 /* Re-enable tracing if requested */ 4524 /* Re-enable tracing if requested */
4358 if (!disable_tracing) { 4525 if (!disable_tracing) {
4359 trace_flags |= old_userobj; 4526 trace_flags |= old_userobj;
@@ -4370,9 +4537,9 @@ static void __ftrace_dump(bool disable_tracing)
4370} 4537}
4371 4538
4372/* By default: disable tracing after the dump */ 4539/* By default: disable tracing after the dump */
4373void ftrace_dump(void) 4540void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4374{ 4541{
4375 __ftrace_dump(true); 4542 __ftrace_dump(true, oops_dump_mode);
4376} 4543}
4377 4544
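Since ftrace_dump() now takes the dump mode explicitly, in-kernel callers choose between dumping every CPU's buffer and only the current one. A hypothetical debugging hook (assumes the usual declarations from linux/kernel.h are in scope):

	static void my_debug_hook(void)
	{
		if (ftrace_dump_on_oops)
			ftrace_dump(ftrace_dump_on_oops); /* honour the boot/sysctl setting */
		else
			ftrace_dump(DUMP_ORIG);           /* dump this CPU's buffer only */
	}
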
4378__init static int tracer_alloc_buffers(void) 4545__init static int tracer_alloc_buffers(void)
@@ -4387,9 +4554,6 @@ __init static int tracer_alloc_buffers(void)
4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4554 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4388 goto out_free_buffer_mask; 4555 goto out_free_buffer_mask;
4389 4556
4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4391 goto out_free_tracing_cpumask;
4392
4393 /* To save memory, keep the ring buffer size to its minimum */ 4557 /* To save memory, keep the ring buffer size to its minimum */
4394 if (ring_buffer_expanded) 4558 if (ring_buffer_expanded)
4395 ring_buf_size = trace_buf_size; 4559 ring_buf_size = trace_buf_size;
@@ -4447,8 +4611,6 @@ __init static int tracer_alloc_buffers(void)
4447 return 0; 4611 return 0;
4448 4612
4449out_free_cpumask: 4613out_free_cpumask:
4450 free_cpumask_var(tracing_reader_cpumask);
4451out_free_tracing_cpumask:
4452 free_cpumask_var(tracing_cpumask); 4614 free_cpumask_var(tracing_cpumask);
4453out_free_buffer_mask: 4615out_free_buffer_mask:
4454 free_cpumask_var(tracing_buffer_mask); 4616 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -396,9 +385,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
396 385
397extern unsigned long nsecs_to_usecs(unsigned long nsecs); 386extern unsigned long nsecs_to_usecs(unsigned long nsecs);
398 387
388extern unsigned long tracing_thresh;
389
399#ifdef CONFIG_TRACER_MAX_TRACE 390#ifdef CONFIG_TRACER_MAX_TRACE
400extern unsigned long tracing_max_latency; 391extern unsigned long tracing_max_latency;
401extern unsigned long tracing_thresh;
402 392
403void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 393void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
404void update_max_tr_single(struct trace_array *tr, 394void update_max_tr_single(struct trace_array *tr,
@@ -415,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
415void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
416 int pc); 406 int pc);
417#else 407#else
418static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
419 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
420{ 410{
421} 411}
422 412
423static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
424 unsigned long flags, int pc) 414 unsigned long flags, int pc)
425{ 415{
426} 416}
@@ -466,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
466 struct trace_array *tr); 456 struct trace_array *tr);
467extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
468 struct trace_array *tr); 458 struct trace_array *tr);
469extern int trace_selftest_startup_hw_branches(struct tracer *trace,
470 struct trace_array *tr);
471extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
472 struct trace_array *tr); 460 struct trace_array *tr);
473#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -490,13 +478,34 @@ extern int trace_clock_id;
490 478
491/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
492#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
493extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
494extern enum print_line_t 493extern enum print_line_t
495trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
496 504
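Moving the TRACE_GRAPH_PRINT_* bits into trace.h lets other tracers drive the shared graph output code with their own option mix via print_graph_function_flags(). A hypothetical print_line callback:

	static enum print_line_t my_tracer_print_line(struct trace_iterator *iter)
	{
		u32 flags = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_PROC |
			    TRACE_GRAPH_PRINT_DURATION;

		return print_graph_function_flags(iter, flags);
	}
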
497#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
498/* TODO: make this variable */ 506/* TODO: make this variable */
499#define FTRACE_GRAPH_MAX_FUNCS 32 507#define FTRACE_GRAPH_MAX_FUNCS 32
508extern int ftrace_graph_filter_enabled;
500extern int ftrace_graph_count; 509extern int ftrace_graph_count;
501extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 510extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
502 511
@@ -504,7 +513,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
504{ 513{
505 int i; 514 int i;
506 515
507 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 516 if (!ftrace_graph_filter_enabled)
508 return 1; 517 return 1;
509 518
510 for (i = 0; i < ftrace_graph_count; i++) { 519 for (i = 0; i < ftrace_graph_count; i++) {
@@ -522,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
522#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
523#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
524static inline enum print_line_t 533static inline enum print_line_t
525print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
526{ 535{
527 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
528} 537}
@@ -549,7 +558,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
 549 * struct trace_parser - serves for reading the user input separated by spaces 558
550 * @cont: set if the input is not complete - no final space char was found 559 * @cont: set if the input is not complete - no final space char was found
551 * @buffer: holds the parsed user input 560 * @buffer: holds the parsed user input
552 * @idx: user input lenght 561 * @idx: user input length
553 * @size: buffer size 562 * @size: buffer size
554 */ 563 */
555struct trace_parser { 564struct trace_parser {
@@ -769,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
769 struct trace_seq *s); 778 struct trace_seq *s);
770extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
771 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
772static inline int 784static inline int
773filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
774 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
775 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
776{ 788{
777 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
778 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
779 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
780 return 1; 792 return 1;
@@ -791,7 +803,8 @@ extern const char *__stop___trace_bprintk_fmt[];
791 803
792#undef FTRACE_ENTRY 804#undef FTRACE_ENTRY
793#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 805#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
794 extern struct ftrace_event_call event_##call; 806 extern struct ftrace_event_call \
807 __attribute__((__aligned__(4))) event_##call;
795#undef FTRACE_ENTRY_DUP 808#undef FTRACE_ENTRY_DUP
796#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 809#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
797 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 810 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
@@ -307,8 +311,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 311 return -1;
308 if (percent_a > percent_b) 312 if (percent_a > percent_b)
309 return 1; 313 return 1;
310 else 314
311 return 0; 315 if (a->incorrect < b->incorrect)
316 return -1;
317 if (a->incorrect > b->incorrect)
318 return 1;
319
320 /*
321 * Since the above shows worse (incorrect) cases
322 * first, we continue that by showing best (correct)
323 * cases last.
324 */
325 if (a->correct > b->correct)
326 return -1;
327 if (a->correct < b->correct)
328 return 1;
329
330 return 0;
312} 331}
313 332
314static struct tracer_stat annotated_branch_stats = { 333static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
13 * Tracer plugins will chose a default from these clocks. 13 * Tracer plugins will chose a default from these clocks.
14 */ 14 */
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/irqflags.h>
16#include <linux/hardirq.h> 17#include <linux/hardirq.h>
17#include <linux/module.h> 18#include <linux/module.h>
18#include <linux/percpu.h> 19#include <linux/percpu.h>
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void)
83 int this_cpu; 84 int this_cpu;
84 u64 now; 85 u64 now;
85 86
86 raw_local_irq_save(flags); 87 local_irq_save(flags);
87 88
88 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void)
109 arch_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
110 111
111 out: 112 out:
112 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
113 114
114 return now; 115 return now;
115} 116}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
new file mode 100644
index 000000000000..8a2b73f7c068
--- /dev/null
+++ b/kernel/trace/trace_event_perf.c
@@ -0,0 +1,195 @@
1/*
2 * trace event based perf event profiling/tracing
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
6 */
7
8#include <linux/module.h>
9#include <linux/kprobes.h>
10#include "trace.h"
11
12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
13
14static char *perf_trace_buf[4];
15
16/*
17 * Force it to be aligned to unsigned long to avoid misaligned accesses
18 * suprises
19 */
20typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21 perf_trace_t;
22
23/* Count the events in use (per event id, not per instance) */
24static int total_ref_count;
25
26static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event)
28{
29 struct hlist_head *list;
30 int ret = -ENOMEM;
31 int cpu;
32
33 p_event->tp_event = tp_event;
34 if (tp_event->perf_refcount++ > 0)
35 return 0;
36
37 list = alloc_percpu(struct hlist_head);
38 if (!list)
39 goto fail;
40
41 for_each_possible_cpu(cpu)
42 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
43
44 tp_event->perf_events = list;
45
46 if (!total_ref_count) {
47 char *buf;
48 int i;
49
50 for (i = 0; i < 4; i++) {
51 buf = (char *)alloc_percpu(perf_trace_t);
52 if (!buf)
53 goto fail;
54
55 perf_trace_buf[i] = buf;
56 }
57 }
58
59 if (tp_event->class->reg)
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret)
67 goto fail;
68
69 total_ref_count++;
70 return 0;
71
72fail:
73 if (!total_ref_count) {
74 int i;
75
76 for (i = 0; i < 4; i++) {
77 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL;
79 }
80 }
81
82 if (!--tp_event->perf_refcount) {
83 free_percpu(tp_event->perf_events);
84 tp_event->perf_events = NULL;
85 }
86
87 return ret;
88}
89
90int perf_trace_init(struct perf_event *p_event)
91{
92 struct ftrace_event_call *tp_event;
93 int event_id = p_event->attr.config;
94 int ret = -EINVAL;
95
96 mutex_lock(&event_mutex);
97 list_for_each_entry(tp_event, &ftrace_events, list) {
98 if (tp_event->event.type == event_id &&
99 tp_event->class &&
100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event);
104 break;
105 }
106 }
107 mutex_unlock(&event_mutex);
108
109 return ret;
110}
111
112int perf_trace_enable(struct perf_event *p_event)
113{
114 struct ftrace_event_call *tp_event = p_event->tp_event;
115 struct hlist_head *list;
116
117 list = tp_event->perf_events;
118 if (WARN_ON_ONCE(!list))
119 return -EINVAL;
120
121 list = this_cpu_ptr(list);
122 hlist_add_head_rcu(&p_event->hlist_entry, list);
123
124 return 0;
125}
126
127void perf_trace_disable(struct perf_event *p_event)
128{
129 hlist_del_rcu(&p_event->hlist_entry);
130}
131
132void perf_trace_destroy(struct perf_event *p_event)
133{
134 struct ftrace_event_call *tp_event = p_event->tp_event;
135 int i;
136
137 mutex_lock(&event_mutex);
138 if (--tp_event->perf_refcount > 0)
139 goto out;
140
141 if (tp_event->class->reg)
142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147
148 /*
149 * Ensure our callback won't be called anymore. See
150 * tracepoint_probe_unregister() and __DO_TRACE().
151 */
152 synchronize_sched();
153
154 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL;
156
157 if (!--total_ref_count) {
158 for (i = 0; i < 4; i++) {
159 free_percpu(perf_trace_buf[i]);
160 perf_trace_buf[i] = NULL;
161 }
162 }
163out:
164 mutex_unlock(&event_mutex);
165}
166
167__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
168 struct pt_regs *regs, int *rctxp)
169{
170 struct trace_entry *entry;
171 unsigned long flags;
172 char *raw_data;
173 int pc;
174
175 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
176
177 pc = preempt_count();
178
179 *rctxp = perf_swevent_get_recursion_context();
180 if (*rctxp < 0)
181 return NULL;
182
183 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
184
185 /* zero the dead bytes from align to not leak stack to user */
186 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
187
188 entry = (struct trace_entry *)raw_data;
189 local_save_flags(flags);
190 tracing_generic_entry_update(entry, flags, pc);
191 entry->type = type;
192
193 return raw_data;
194}
195EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
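
The new trace_event_perf.c keeps one global total_ref_count across all events plus a per-event perf_refcount: the first user allocates the four shared per-cpu buffers, later users only take a reference, and the last one frees them again. A simplified userspace analogue of that first-user-allocates / last-user-frees pattern (plain calloc() buffers instead of alloc_percpu(), no locking, and different error handling than the kernel code):

#include <stdio.h>
#include <stdlib.h>

#define NR_BUFS   4	/* stand-in for the four recursion-context buffers */
#define BUF_SIZE  2048	/* stand-in for PERF_MAX_TRACE_SIZE */

static char *trace_buf[NR_BUFS];
static int total_ref_count;	/* one count shared by all users */

/* First user allocates the shared buffers, later users just take a ref. */
static int trace_buf_get(void)
{
	if (total_ref_count++ > 0)
		return 0;

	for (int i = 0; i < NR_BUFS; i++) {
		trace_buf[i] = calloc(1, BUF_SIZE);
		if (!trace_buf[i]) {
			while (i--) {		/* roll back on failure */
				free(trace_buf[i]);
				trace_buf[i] = NULL;
			}
			total_ref_count--;
			return -1;
		}
	}
	return 0;
}

/* Last user frees them again. */
static void trace_buf_put(void)
{
	if (--total_ref_count)
		return;
	for (int i = 0; i < NR_BUFS; i++) {
		free(trace_buf[i]);
		trace_buf[i] = NULL;
	}
}

int main(void)
{
	trace_buf_get();	/* allocates */
	trace_buf_get();	/* only bumps the count */
	trace_buf_put();
	trace_buf_put();	/* frees */
	printf("refs left: %d\n", total_ref_count);
	return 0;
}
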
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
deleted file mode 100644
index 9e25573242cf..000000000000
--- a/kernel/trace/trace_event_profile.c
+++ /dev/null
@@ -1,122 +0,0 @@
1/*
2 * trace event based perf counter profiling
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 */
7
8#include <linux/module.h>
9#include "trace.h"
10
11
12char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19
20/* Count the events in use (per event id, not per instance) */
21static int total_profile_count;
22
23static int ftrace_profile_enable_event(struct ftrace_event_call *event)
24{
25 char *buf;
26 int ret = -ENOMEM;
27
28 if (event->profile_count++ > 0)
29 return 0;
30
31 if (!total_profile_count) {
32 buf = (char *)alloc_percpu(perf_trace_t);
33 if (!buf)
34 goto fail_buf;
35
36 rcu_assign_pointer(perf_trace_buf, buf);
37
38 buf = (char *)alloc_percpu(perf_trace_t);
39 if (!buf)
40 goto fail_buf_nmi;
41
42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
43 }
44
45 ret = event->profile_enable(event);
46 if (!ret) {
47 total_profile_count++;
48 return 0;
49 }
50
51fail_buf_nmi:
52 if (!total_profile_count) {
53 free_percpu(perf_trace_buf_nmi);
54 free_percpu(perf_trace_buf);
55 perf_trace_buf_nmi = NULL;
56 perf_trace_buf = NULL;
57 }
58fail_buf:
59 event->profile_count--;
60
61 return ret;
62}
63
64int ftrace_profile_enable(int event_id)
65{
66 struct ftrace_event_call *event;
67 int ret = -EINVAL;
68
69 mutex_lock(&event_mutex);
70 list_for_each_entry(event, &ftrace_events, list) {
71 if (event->id == event_id && event->profile_enable &&
72 try_module_get(event->mod)) {
73 ret = ftrace_profile_enable_event(event);
74 break;
75 }
76 }
77 mutex_unlock(&event_mutex);
78
79 return ret;
80}
81
82static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{
84 char *buf, *nmi_buf;
85
86 if (--event->profile_count > 0)
87 return;
88
89 event->profile_disable(event);
90
91 if (!--total_profile_count) {
92 buf = perf_trace_buf;
93 rcu_assign_pointer(perf_trace_buf, NULL);
94
95 nmi_buf = perf_trace_buf_nmi;
96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
97
98 /*
99 * Ensure every events in profiling have finished before
100 * releasing the buffers
101 */
102 synchronize_sched();
103
104 free_percpu(buf);
105 free_percpu(nmi_buf);
106 }
107}
108
109void ftrace_profile_disable(int event_id)
110{
111 struct ftrace_event_call *event;
112
113 mutex_lock(&event_mutex);
114 list_for_each_entry(event, &ftrace_events, list) {
115 if (event->id == event_id) {
116 ftrace_profile_disable_event(event);
117 module_put(event->mod);
118 break;
119 }
120 }
121 mutex_unlock(&event_mutex);
122}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -28,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
28 29
29LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
30 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
31int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
32 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
33 int filter_type) 42 int filter_type)
34{ 43{
35 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
36 49
37 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
38 if (!field) 51 if (!field)
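
trace_get_fields(), added in the hunk above, lets an event either carry its own field list in its class or compute one through an optional class->get_fields() callback. A small userspace sketch of that accessor-with-fallback shape, with invented names (event_get_fields, struct event_class) rather than the kernel types:

#include <stdio.h>

struct field_list {
	const char *name;	/* just enough to print something */
};

/* Hypothetical event class: fields may be fixed, or produced on demand. */
struct event_class {
	struct field_list fields;
	struct field_list *(*get_fields)(struct event_class *cls);
};

struct event_call {
	struct event_class *class;
};

/* Accessor with a fallback, mirroring the shape of trace_get_fields(). */
static struct field_list *event_get_fields(struct event_call *call)
{
	if (!call->class->get_fields)
		return &call->class->fields;
	return call->class->get_fields(call->class);
}

static struct field_list dynamic_fields = { "dynamic" };

static struct field_list *get_dynamic(struct event_class *cls)
{
	(void)cls;
	return &dynamic_fields;
}

int main(void)
{
	struct event_class plain = { .fields = { "static" } };
	struct event_class fancy = { .get_fields = get_dynamic };
	struct event_call a = { &plain }, b = { &fancy };

	printf("%s\n", event_get_fields(&a)->name);
	printf("%s\n", event_get_fields(&b)->name);
	return 0;
}
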
@@ -55,15 +68,14 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
55 field->size = size; 68 field->size = size;
56 field->is_signed = is_signed; 69 field->is_signed = is_signed;
57 70
58 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
59 73
60 return 0; 74 return 0;
61 75
62err: 76err:
63 if (field) { 77 if (field)
64 kfree(field->name); 78 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 79 kfree(field);
68 80
69 return -ENOMEM; 81 return -ENOMEM;
@@ -95,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
95void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
96{ 108{
97 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
98 111
99 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
100 list_del(&field->link); 114 list_del(&field->link);
101 kfree(field->type); 115 kfree(field->type);
102 kfree(field->name); 116 kfree(field->name);
@@ -108,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
108{ 122{
109 int id; 123 int id;
110 124
111 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
112 if (!id) 126 if (!id)
113 return -ENODEV; 127 return -ENODEV;
114 call->id = id;
115 INIT_LIST_HEAD(&call->fields);
116 128
117 return 0; 129 return 0;
118} 130}
@@ -125,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
125 137
126 switch (enable) { 138 switch (enable) {
127 case 0: 139 case 0:
128 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
129 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
130 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
131 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
132 } 149 }
133 break; 150 break;
134 case 1: 151 case 1:
135 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
136 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
137 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
138 if (ret) { 160 if (ret) {
139 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
140 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
141 "%s\n", call->name); 163 "%s\n", call->name);
142 break; 164 break;
143 } 165 }
144 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
145 } 167 }
146 break; 168 break;
147 } 169 }
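
ftrace_event_enable_disable() now prefers an optional class->reg() hook and falls back to the generic tracepoint_probe_register()/unregister() path when the hook is absent. A compact userspace sketch of that optional-callback-with-fallback idea, using hypothetical names (event_set_enabled, generic_register):

#include <stdio.h>

enum reg_op { REG_REGISTER, REG_UNREGISTER };

struct event {
	const char *name;
	/* Optional per-class hook; NULL means "use the generic path". */
	int (*reg)(struct event *ev, enum reg_op op);
	unsigned int enabled;
};

static int generic_register(struct event *ev, enum reg_op op)
{
	printf("%s: generic %s\n", ev->name,
	       op == REG_REGISTER ? "register" : "unregister");
	return 0;
}

/* Mirrors the shape of the enable/disable switch after this patch. */
static int event_set_enabled(struct event *ev, int enable)
{
	int (*reg)(struct event *, enum reg_op) =
		ev->reg ? ev->reg : generic_register;

	if (enable && !ev->enabled) {
		int ret = reg(ev, REG_REGISTER);
		if (ret)
			return ret;
		ev->enabled = 1;
	} else if (!enable && ev->enabled) {
		ev->enabled = 0;
		reg(ev, REG_UNREGISTER);
	}
	return 0;
}

static int custom_reg(struct event *ev, enum reg_op op)
{
	printf("%s: custom hook, op=%d\n", ev->name, op);
	return 0;
}

int main(void)
{
	struct event plain = { .name = "plain" };
	struct event special = { .name = "special", .reg = custom_reg };

	event_set_enabled(&plain, 1);
	event_set_enabled(&special, 1);
	event_set_enabled(&special, 0);
	return 0;
}
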
@@ -172,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
172 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
173 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
174 196
175 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
176 continue; 199 continue;
177 200
178 if (match && 201 if (match &&
179 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
180 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
181 continue; 204 continue;
182 205
183 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
184 continue; 207 continue;
185 208
186 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -298,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
298 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
299 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
300 */ 323 */
301 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
302 return call; 325 return call;
303 } 326 }
304 327
@@ -329,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
329 (*pos)++; 352 (*pos)++;
330 353
331 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
332 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
333 return call; 356 return call;
334 } 357 }
335 358
@@ -356,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
356{ 379{
357 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
358 381
359 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
360 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
361 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
362 385
363 return 0; 386 return 0;
@@ -388,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
388 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
389 char *buf; 412 char *buf;
390 413
391 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
392 buf = "1\n"; 415 buf = "1\n";
393 else 416 else
394 buf = "0\n"; 417 buf = "0\n";
@@ -451,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
451 474
452 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
453 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
454 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
455 continue; 479 continue;
456 480
457 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
458 continue; 482 continue;
459 483
460 /* 484 /*
@@ -462,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
462 * or if all events or cleared, or if we have 486 * or if all events or cleared, or if we have
463 * a mixture. 487 * a mixture.
464 */ 488 */
465 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
466 490
467 /* 491 /*
468 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -520,41 +544,17 @@ out:
520 return ret; 544 return ret;
521} 545}
522 546
523extern char *__bad_type_size(void);
524
525#undef FIELD
526#define FIELD(type, name) \
527 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
528 #type, "common_" #name, offsetof(typeof(field), name), \
529 sizeof(field.name), is_signed_type(type)
530
531static int trace_write_header(struct trace_seq *s)
532{
533 struct trace_entry field;
534
535 /* struct trace_entry */
536 return trace_seq_printf(s,
537 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
538 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
539 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
540 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
541 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
542 "\n",
543 FIELD(unsigned short, type),
544 FIELD(unsigned char, flags),
545 FIELD(unsigned char, preempt_count),
546 FIELD(int, pid),
547 FIELD(int, lock_depth));
548}
549
550static ssize_t 547static ssize_t
551event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 548event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
552 loff_t *ppos) 549 loff_t *ppos)
553{ 550{
554 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
552 struct ftrace_event_field *field;
553 struct list_head *head;
555 struct trace_seq *s; 554 struct trace_seq *s;
555 int common_field_count = 5;
556 char *buf; 556 char *buf;
557 int r; 557 int r = 0;
558 558
559 if (*ppos) 559 if (*ppos)
560 return 0; 560 return 0;
@@ -565,14 +565,49 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
565 565
566 trace_seq_init(s); 566 trace_seq_init(s);
567 567
568 /* If any of the first writes fail, so will the show_format. */
569
570 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
571 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
572 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
573 trace_write_header(s);
574 571
575 r = call->show_format(call, s); 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
574 /*
575 * Smartly shows the array type (except dynamic array).
576 * Normal:
577 * field:TYPE VAR
578 * If TYPE := TYPE[LEN], it is shown:
579 * field:TYPE VAR[LEN]
580 */
581 const char *array_descriptor = strchr(field->type, '[');
582
583 if (!strncmp(field->type, "__data_loc", 10))
584 array_descriptor = NULL;
585
586 if (!array_descriptor) {
587 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
588 "\tsize:%u;\tsigned:%d;\n",
589 field->type, field->name, field->offset,
590 field->size, !!field->is_signed);
591 } else {
592 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
593 "\tsize:%u;\tsigned:%d;\n",
594 (int)(array_descriptor - field->type),
595 field->type, field->name,
596 array_descriptor, field->offset,
597 field->size, !!field->is_signed);
598 }
599
600 if (--common_field_count == 0)
601 r = trace_seq_printf(s, "\n");
602
603 if (!r)
604 break;
605 }
606
607 if (r)
608 r = trace_seq_printf(s, "\nprint fmt: %s\n",
609 call->print_fmt);
610
576 if (!r) { 611 if (!r) {
577 /* 612 /*
578 * ug! The format output is bigger than a PAGE!! 613 * ug! The format output is bigger than a PAGE!!
@@ -605,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
605 return -ENOMEM; 640 return -ENOMEM;
606 641
607 trace_seq_init(s); 642 trace_seq_init(s);
608 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
609 644
610 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
611 s->buffer, s->len); 646 s->buffer, s->len);
@@ -911,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
911 const struct file_operations *filter, 946 const struct file_operations *filter,
912 const struct file_operations *format) 947 const struct file_operations *format)
913{ 948{
949 struct list_head *head;
914 int ret; 950 int ret;
915 951
916 /* 952 /*
917 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
918 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
919 */ 955 */
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
922 958
923 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
924 if (!call->dir) { 960 if (!call->dir) {
@@ -927,31 +963,36 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
927 return -1; 963 return -1;
928 } 964 }
929 965
930 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
931 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
932 enable); 968 enable);
933 969
934 if (call->id && call->profile_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
935 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
936 id); 973 id);
974#endif
937 975
938 if (call->define_fields) { 976 if (call->class->define_fields) {
939 ret = trace_define_common_fields(call); 977 /*
940 if (!ret) 978 * Other events may have the same class. Only update
941 ret = call->define_fields(call); 979 * the fields if they are not already defined.
942 if (ret < 0) { 980 */
943 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
944 " events/%s\n", call->name); 982 if (list_empty(head)) {
945 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
946 } 991 }
947 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
948 filter); 993 filter);
949 } 994 }
950 995
951 /* A trace may not want to export its format */
952 if (!call->show_format)
953 return 0;
954
955 trace_create_file("format", 0444, call->dir, call, 996 trace_create_file("format", 0444, call->dir, call,
956 format); 997 format);
957 998
@@ -966,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
966 if (!call->name) 1007 if (!call->name)
967 return -EINVAL; 1008 return -EINVAL;
968 1009
969 if (call->raw_init) { 1010 if (call->class->raw_init) {
970 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
971 if (ret < 0) { 1012 if (ret < 0) {
972 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
973 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1031,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1031static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1032{ 1073{
1033 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1034 if (call->event) 1075 if (call->event.funcs)
1035 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1036 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1037 list_del(&call->list); 1078 list_del(&call->list);
1038 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1039 destroy_preds(call); 1080 destroy_preds(call);
1040 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1041} 1082}
1042 1083
1043/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1128,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1128 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1129 if (!call->name) 1170 if (!call->name)
1130 continue; 1171 continue;
1131 if (call->raw_init) { 1172 if (call->class->raw_init) {
1132 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1133 if (ret < 0) { 1174 if (ret < 0) {
1134 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1135 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1282,8 +1323,8 @@ static __init int event_trace_init(void)
1282 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1283 if (!call->name) 1324 if (!call->name)
1284 continue; 1325 continue;
1285 if (call->raw_init) { 1326 if (call->class->raw_init) {
1286 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1287 if (ret < 0) { 1328 if (ret < 0) {
1288 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1289 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1384,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1384 1425
1385 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1386 1427
1387 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1388 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1389 continue; 1430 continue;
1390 1431
1391/* 1432/*
@@ -1395,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1395 * syscalls as we test. 1436 * syscalls as we test.
1396 */ 1437 */
1397#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1398 if (call->system && 1439 if (call->class->system &&
1399 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1400 continue; 1441 continue;
1401#endif 1442#endif
1402 1443
@@ -1406,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1406 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1407 * it and the self test should not be on. 1448 * it and the self test should not be on.
1408 */ 1449 */
1409 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1410 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1411 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1412 continue; 1453 continue;
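
With the per-event show_format() and trace_write_header() removed, event_format_read() in this file now derives the events/<name>/format text generically by walking the list returned by trace_get_fields(). The sketch below shows the same metadata-driven formatting in plain C, describing a made-up struct sample_entry through an explicit field table built with offsetof() and sizeof():

#include <stdio.h>
#include <stddef.h>

/* A toy record whose layout we want to describe, like a trace entry. */
struct sample_entry {
	unsigned short	type;
	int		pid;
	char		comm[16];
};

struct field_desc {
	const char	*type;
	const char	*name;
	unsigned int	offset;
	unsigned int	size;
	int		is_signed;
};

/* Field table, ordered the way the kernel walks its field list. */
static const struct field_desc fields[] = {
	{ "unsigned short", "type", offsetof(struct sample_entry, type),
	  sizeof(unsigned short), 0 },
	{ "int", "pid", offsetof(struct sample_entry, pid),
	  sizeof(int), 1 },
	{ "char[16]", "comm", offsetof(struct sample_entry, comm),
	  sizeof(char[16]), 1 },
};

int main(void)
{
	/* Same line shape as the events/<name>/format file. */
	for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
		printf("\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
		       fields[i].type, fields[i].name,
		       fields[i].offset, fields[i].size, fields[i].is_signed);
	return 0;
}

Driving the output from one field list is what lets events with a shared class reuse the same description instead of each carrying its own show_format() callback.
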
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/slab.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
@@ -499,8 +500,10 @@ static struct ftrace_event_field *
499find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
500{ 501{
501 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
502 504
503 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
504 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
505 return field; 508 return field;
506 } 509 }
@@ -544,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
544 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
545 int i; 548 int i;
546 549
547 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
548 filter->n_preds = 0; 551 filter->n_preds = 0;
549 552
550 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -571,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
571{ 574{
572 __free_preds(call->filter); 575 __free_preds(call->filter);
573 call->filter = NULL; 576 call->filter = NULL;
574 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
575} 578}
576 579
577static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -610,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
610 if (call->filter) 613 if (call->filter)
611 return 0; 614 return 0;
612 615
613 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
614 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
615 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
616 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -624,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
624 int err; 627 int err;
625 628
626 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
627 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
628 continue; 631 continue;
629 632
630 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
631 continue; 634 continue;
632 635
633 err = init_preds(call); 636 err = init_preds(call);
@@ -643,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
643 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
644 647
645 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
646 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
647 continue; 650 continue;
648 651
649 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
650 continue; 653 continue;
651 654
652 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1248,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1248 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1249 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1250 1253
1251 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1252 continue; 1255 continue;
1253 1256
1254 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1255 continue; 1258 continue;
1256 1259
1257 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1265,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1265 if (err) 1268 if (err)
1266 filter_disable_preds(call); 1269 filter_disable_preds(call);
1267 else { 1270 else {
1268 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1269 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1270 } 1273 }
1271 fail = false; 1274 fail = false;
@@ -1314,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1314 if (err) 1317 if (err)
1315 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1316 else 1319 else
1317 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1318out: 1321out:
1319 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1320 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1371,7 +1374,7 @@ out_unlock:
1371 return err; 1374 return err;
1372} 1375}
1373 1376
1374#ifdef CONFIG_EVENT_PROFILE 1377#ifdef CONFIG_PERF_EVENTS
1375 1378
1376void ftrace_profile_free_filter(struct perf_event *event) 1379void ftrace_profile_free_filter(struct perf_event *event)
1377{ 1380{
@@ -1392,12 +1395,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1392 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1393 1396
1394 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1395 if (call->id == event_id) 1398 if (call->event.type == event_id)
1396 break; 1399 break;
1397 } 1400 }
1398 1401
1399 err = -EINVAL; 1402 err = -EINVAL;
1400 if (!call) 1403 if (&call->list == &ftrace_events)
1401 goto out_unlock; 1404 goto out_unlock;
1402 1405
1403 err = -EEXIST; 1406 err = -EEXIST;
@@ -1439,5 +1442,5 @@ out_unlock:
1439 return err; 1442 return err;
1440} 1443}
1441 1444
1442#endif /* CONFIG_EVENT_PROFILE */ 1445#endif /* CONFIG_PERF_EVENTS */
1443 1446
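
The ftrace_profile_set_filter() hunk above replaces "if (!call)" with a comparison of &call->list against &ftrace_events, because list_for_each_entry() never leaves its cursor NULL: after a full pass it points at the container computed from the list head itself. A self-contained sketch of that "not found" test, with a stripped-down list implementation (the kernel macro infers the element type with typeof(); here it is passed explicitly to stay in standard C):

#include <stdio.h>
#include <stddef.h>

/* Minimal circular doubly linked list, in the style of <linux/list.h>. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_for_each_entry(pos, head, type, member)			\
	for (pos = container_of((head)->next, type, member);		\
	     &pos->member != (head);					\
	     pos = container_of(pos->member.next, type, member))

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

struct event_call {
	int id;
	struct list_head list;
};

static struct list_head events = LIST_HEAD_INIT(events);

int main(void)
{
	struct event_call a = { .id = 1 }, b = { .id = 2 }, *call;
	int wanted = 3;

	list_add_tail(&a.list, &events);
	list_add_tail(&b.list, &events);

	list_for_each_entry(call, &events, struct event_call, list) {
		if (call->id == wanted)
			break;
	}

	/*
	 * After a full pass the cursor points at the head's container, not
	 * at NULL, so "not found" is detected by comparing list pointers,
	 * just as the patched lookup does.
	 */
	if (&call->list == &events)
		printf("id %d not found\n", wanted);
	else
		printf("found id %d\n", call->id);
	return 0;
}
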
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
175 return ret; 103 return ret;
176 104
177#undef __dynamic_array 105#undef __dynamic_array
178#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
179 112
180#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -194,10 +127,13 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
194 127
195static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
196{ 129{
197 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
198 return 0; 131 return 0;
199} 132}
200 133
134#undef __entry
135#define __entry REC
136
201#undef __field 137#undef __field
202#define __field(type, item) 138#define __field(type, item)
203 139
@@ -213,18 +149,25 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
213#undef __dynamic_array 149#undef __dynamic_array
214#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
215 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
216#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
218 \ 163 \
219struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
220__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
221__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
222 .name = #call, \ 167 .name = #call, \
223 .id = type, \ 168 .event.type = etype, \
224 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
225 .raw_init = ftrace_raw_init_event, \ 170 .print_fmt = print, \
226 .show_format = ftrace_format_##call, \
227 .define_fields = ftrace_define_fields_##call, \
228}; \ 171}; \
229 172
230#include "trace_entries.h" 173#include "trace_entries.h"
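
trace_export.c relies on re-including trace_entries.h with FTRACE_ENTRY redefined each time, so the same entry list is expanded once into field definitions and once into event and class definitions. The sketch below shows the underlying X-macro technique in isolation; the EVENT_LIST macro and the event names are invented, and the kernel keeps the list in a separate header instead of a macro:

#include <stdio.h>
#include <stddef.h>

/* The "list" that the kernel keeps in trace_entries.h and includes twice. */
#define EVENT_LIST(X)	\
	X(function)	\
	X(wakeup)	\
	X(branch)

struct event_desc {
	const char *name;
};

/* First expansion: emit one descriptor per entry. */
#define DEFINE_EVENT(name) \
	static struct event_desc event_##name = { #name };
EVENT_LIST(DEFINE_EVENT)
#undef DEFINE_EVENT

/* Second expansion over the same list: collect them into a table. */
#define EVENT_PTR(name) &event_##name,
static struct event_desc *events[] = {
	EVENT_LIST(EVENT_PTR)
};
#undef EVENT_PTR

int main(void)
{
	for (size_t i = 0; i < sizeof(events) / sizeof(events[0]); i++)
		printf("event: %s\n", events[i]->name);
	return 0;
}
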
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
@@ -18,6 +19,7 @@ struct fgraph_cpu_data {
18 pid_t last_pid; 19 pid_t last_pid;
19 int depth; 20 int depth;
20 int ignore; 21 int ignore;
22 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
21}; 23};
22 24
23struct fgraph_data { 25struct fgraph_data {
@@ -38,7 +40,7 @@ struct fgraph_data {
38#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
39#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
40#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
41#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
42 44
43static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
44 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -177,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
177 return ret; 179 return ret;
178} 180}
179 181
180static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
181 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
182 unsigned long flags, 184 unsigned long flags,
183 int pc) 185 int pc)
@@ -187,7 +189,7 @@ static int __trace_graph_entry(struct trace_array *tr,
187 struct ring_buffer *buffer = tr->buffer; 189 struct ring_buffer *buffer = tr->buffer;
188 struct ftrace_graph_ent_entry *entry; 190 struct ftrace_graph_ent_entry *entry;
189 191
190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 192 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
191 return 0; 193 return 0;
192 194
193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 195 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -212,13 +214,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
212 int cpu; 214 int cpu;
213 int pc; 215 int pc;
214 216
215 if (unlikely(!tr))
216 return 0;
217
218 if (!ftrace_trace_task(current)) 217 if (!ftrace_trace_task(current))
219 return 0; 218 return 0;
220 219
221 if (!ftrace_graph_addr(trace->func)) 220 /* trace it when it is-nested-in or is a function enabled. */
221 if (!(trace->depth || ftrace_graph_addr(trace->func)))
222 return 0; 222 return 0;
223 223
224 local_irq_save(flags); 224 local_irq_save(flags);
@@ -231,9 +231,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
231 } else { 231 } else {
232 ret = 0; 232 ret = 0;
233 } 233 }
234 /* Only do the atomic if it is not already set */
235 if (!test_tsk_trace_graph(current))
236 set_tsk_trace_graph(current);
237 234
238 atomic_dec(&data->disabled); 235 atomic_dec(&data->disabled);
239 local_irq_restore(flags); 236 local_irq_restore(flags);
@@ -241,7 +238,15 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
241 return ret; 238 return ret;
242} 239}
243 240
244static void __trace_graph_return(struct trace_array *tr, 241int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
242{
243 if (tracing_thresh)
244 return 1;
245 else
246 return trace_graph_entry(trace);
247}
248
249void __trace_graph_return(struct trace_array *tr,
245 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
246 unsigned long flags, 251 unsigned long flags,
247 int pc) 252 int pc)
@@ -251,7 +256,7 @@ static void __trace_graph_return(struct trace_array *tr,
251 struct ring_buffer *buffer = tr->buffer; 256 struct ring_buffer *buffer = tr->buffer;
252 struct ftrace_graph_ret_entry *entry; 257 struct ftrace_graph_ret_entry *entry;
253 258
254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) 259 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
255 return; 260 return;
256 261
257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 262 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -281,19 +286,39 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
281 pc = preempt_count(); 286 pc = preempt_count();
282 __trace_graph_return(tr, trace, flags, pc); 287 __trace_graph_return(tr, trace, flags, pc);
283 } 288 }
284 if (!trace->depth)
285 clear_tsk_trace_graph(current);
286 atomic_dec(&data->disabled); 289 atomic_dec(&data->disabled);
287 local_irq_restore(flags); 290 local_irq_restore(flags);
288} 291}
289 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296
297 /* Make graph_array visible before we start tracing */
298
299 smp_mb();
300}
301
302void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
303{
304 if (tracing_thresh &&
305 (trace->rettime - trace->calltime < tracing_thresh))
306 return;
307 else
308 trace_graph_return(trace);
309}
310
290static int graph_trace_init(struct trace_array *tr) 311static int graph_trace_init(struct trace_array *tr)
291{ 312{
292 int ret; 313 int ret;
293 314
294 graph_array = tr; 315 set_graph_array(tr);
295 ret = register_ftrace_graph(&trace_graph_return, 316 if (tracing_thresh)
296 &trace_graph_entry); 317 ret = register_ftrace_graph(&trace_graph_thresh_return,
318 &trace_graph_thresh_entry);
319 else
320 ret = register_ftrace_graph(&trace_graph_return,
321 &trace_graph_entry);
297 if (ret) 322 if (ret)
298 return ret; 323 return ret;
299 tracing_start_cmdline_record(); 324 tracing_start_cmdline_record();
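
graph_trace_init() now picks its callbacks once, at registration time: when tracing_thresh is set it installs trace_graph_thresh_entry()/trace_graph_thresh_return(), which drop returns whose duration is below the threshold. A rough userspace sketch of that select-the-handler-up-front idea, with simplified types and no actual tracing:

#include <stdio.h>

static unsigned long long tracing_thresh;	/* 0 means "trace everything" */

struct graph_ret {
	unsigned long long calltime;
	unsigned long long rettime;
};

static void graph_return(struct graph_ret *trace)
{
	printf("duration %llu\n", trace->rettime - trace->calltime);
}

/* Only forward returns whose duration is at or above the threshold. */
static void graph_thresh_return(struct graph_ret *trace)
{
	if (tracing_thresh &&
	    (trace->rettime - trace->calltime < tracing_thresh))
		return;
	graph_return(trace);
}

int main(void)
{
	/* Pick the handler once at init time, like graph_trace_init(). */
	void (*handler)(struct graph_ret *);
	struct graph_ret fast = { 0, 10 }, slow = { 0, 500 };

	tracing_thresh = 100;
	handler = tracing_thresh ? graph_thresh_return : graph_return;

	handler(&fast);		/* filtered out */
	handler(&slow);		/* printed */
	return 0;
}
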
@@ -301,11 +326,6 @@ static int graph_trace_init(struct trace_array *tr)
301 return 0; 326 return 0;
302} 327}
303 328
304void set_graph_array(struct trace_array *tr)
305{
306 graph_array = tr;
307}
308
309static void graph_trace_reset(struct trace_array *tr) 329static void graph_trace_reset(struct trace_array *tr)
310{ 330{
311 tracing_stop_cmdline_record(); 331 tracing_stop_cmdline_record();
@@ -470,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
470 * We need to consume the current entry to see 490 * We need to consume the current entry to see
471 * the next one. 491 * the next one.
472 */ 492 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL); 496 NULL, NULL);
476 } 497 }
477 498
478 if (!event) 499 if (!event)
@@ -506,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
506 527
507/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
508static int 529static int
509print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
510{ 532{
511 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
512 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
513 return 1; 535 return 1;
514 536
515 /* Non nested entry or return */ 537 /* Non nested entry or return */
516 if (duration == -1) 538 if (duration == -1)
517 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
518 540
519 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
520 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
521 if (duration > 100000ULL) 543 if (duration > 100000ULL)
522 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -542,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
542 564
543static enum print_line_t 565static enum print_line_t
544print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
545 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
546{ 568{
547 int ret; 569 int ret;
548 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -552,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
552 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
553 575
554 /* Absolute time */ 576 /* Absolute time */
555 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
556 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
557 if (!ret) 579 if (!ret)
558 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
559 } 581 }
560 582
561 /* Cpu */ 583 /* Cpu */
562 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
563 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
564 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
565 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
566 } 588 }
567 589
568 /* Proc */ 590 /* Proc */
569 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
570 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
571 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
572 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -576,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
576 } 598 }
577 599
578 /* No overhead */ 600 /* No overhead */
579 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
580 if (!ret) 602 if (!ret)
581 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
582 604
@@ -589,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
589 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
590 612
591 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
592 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
593 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
594 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
595 617
@@ -659,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
659static enum print_line_t 681static enum print_line_t
660print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
661 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
662 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
663{ 686{
664 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
665 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -673,24 +696,30 @@ print_graph_entry_leaf(struct trace_iterator *iter,
673 duration = graph_ret->rettime - graph_ret->calltime; 696 duration = graph_ret->rettime - graph_ret->calltime;
674 697
675 if (data) { 698 if (data) {
699 struct fgraph_cpu_data *cpu_data;
676 int cpu = iter->cpu; 700 int cpu = iter->cpu;
677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 701
702 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
678 703
679 /* 704 /*
680 * Comments display at + 1 to depth. Since 705 * Comments display at + 1 to depth. Since
681 * this is a leaf function, keep the comments 706 * this is a leaf function, keep the comments
682 * equal to this depth. 707 * equal to this depth.
683 */ 708 */
684 *depth = call->depth - 1; 709 cpu_data->depth = call->depth - 1;
710
711 /* No need to keep this function around for this depth */
712 if (call->depth < FTRACE_RETFUNC_DEPTH)
713 cpu_data->enter_funcs[call->depth] = 0;
685 } 714 }
686 715
687 /* Overhead */ 716 /* Overhead */
688 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
689 if (!ret) 718 if (!ret)
690 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
691 720
692 /* Duration */ 721 /* Duration */
693 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
694 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
695 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
696 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -713,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
713static enum print_line_t 742static enum print_line_t
714print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
715 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
716 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
717{ 746{
718 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
719 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -721,19 +750,24 @@ print_graph_entry_nested(struct trace_iterator *iter,
721 int i; 750 int i;
722 751
723 if (data) { 752 if (data) {
753 struct fgraph_cpu_data *cpu_data;
724 int cpu = iter->cpu; 754 int cpu = iter->cpu;
725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
726 755
727 *depth = call->depth; 756 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
757 cpu_data->depth = call->depth;
758
759 /* Save this function pointer to see if the exit matches */
760 if (call->depth < FTRACE_RETFUNC_DEPTH)
761 cpu_data->enter_funcs[call->depth] = call->func;
728 } 762 }
729 763
730 /* No overhead */ 764 /* No overhead */
731 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
732 if (!ret) 766 if (!ret)
733 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
734 768
735 /* No time */ 769 /* No time */
736 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
737 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
738 if (!ret) 772 if (!ret)
739 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -759,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 793
760static enum print_line_t 794static enum print_line_t
761print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
762 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
763{ 797{
764 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
765 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -772,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
772 806
773 if (type) { 807 if (type) {
774 /* Interrupt */ 808 /* Interrupt */
775 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
776 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
777 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
778 } 812 }
779 813
780 /* Absolute time */ 814 /* Absolute time */
781 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
782 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
783 if (!ret) 817 if (!ret)
784 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
785 } 819 }
786 820
787 /* Cpu */ 821 /* Cpu */
788 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
789 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
790 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
791 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
792 } 826 }
793 827
794 /* Proc */ 828 /* Proc */
795 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
796 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
797 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
798 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -814,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
814 848
815static enum print_line_t 849static enum print_line_t
816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
817 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
818{ 852{
819 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
820 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -822,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
822 static enum print_line_t ret; 856 static enum print_line_t ret;
823 int cpu = iter->cpu; 857 int cpu = iter->cpu;
824 858
825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
826 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
827 861
828 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
829 if (leaf_ret) 863 if (leaf_ret)
830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
831 else 865 else
832 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
833 867
834 if (data) { 868 if (data) {
835 /* 869 /*
@@ -848,37 +882,47 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 882
849static enum print_line_t 883static enum print_line_t
850print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
851 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
852{ 887{
853 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
854 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
855 pid_t pid = ent->pid; 890 pid_t pid = ent->pid;
856 int cpu = iter->cpu; 891 int cpu = iter->cpu;
892 int func_match = 1;
857 int ret; 893 int ret;
858 int i; 894 int i;
859 895
860 if (data) { 896 if (data) {
897 struct fgraph_cpu_data *cpu_data;
861 int cpu = iter->cpu; 898 int cpu = iter->cpu;
862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); 899
900 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
863 901
864 /* 902 /*
865 * Comments display at + 1 to depth. This is the 903 * Comments display at + 1 to depth. This is the
866 * return from a function, we now want the comments 904 * return from a function, we now want the comments
867 * to display at the same level of the bracket. 905 * to display at the same level of the bracket.
868 */ 906 */
869 *depth = trace->depth - 1; 907 cpu_data->depth = trace->depth - 1;
908
909 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
910 if (cpu_data->enter_funcs[trace->depth] != trace->func)
911 func_match = 0;
912 cpu_data->enter_funcs[trace->depth] = 0;
913 }
870 } 914 }
871 915
872 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
873 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
874 918
875 /* Overhead */ 919 /* Overhead */
876 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
877 if (!ret) 921 if (!ret)
878 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
879 923
880 /* Duration */ 924 /* Duration */
881 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
882 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
883 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
884 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -891,19 +935,32 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
891 return TRACE_TYPE_PARTIAL_LINE; 935 return TRACE_TYPE_PARTIAL_LINE;
892 } 936 }
893 937
894 ret = trace_seq_printf(s, "}\n"); 938 /*
895 if (!ret) 939 * If the return function does not have a matching entry,
896 return TRACE_TYPE_PARTIAL_LINE; 940 * then the entry was lost. Instead of just printing
941 * the '}' and letting the user guess what function this
942 * belongs to, write out the function name.
943 */
944 if (func_match) {
945 ret = trace_seq_printf(s, "}\n");
946 if (!ret)
947 return TRACE_TYPE_PARTIAL_LINE;
948 } else {
949 ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
950 if (!ret)
951 return TRACE_TYPE_PARTIAL_LINE;
952 }
897 953
898 /* Overrun */ 954 /* Overrun */
899 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
900 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
901 trace->overrun); 957 trace->overrun);
902 if (!ret) 958 if (!ret)
903 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
904 } 960 }
905 961
906 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
907 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
908 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
909 966
@@ -911,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
911} 968}
912 969
913static enum print_line_t 970static enum print_line_t
914print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
915 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
916{ 973{
917 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
918 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -924,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
924 if (data) 981 if (data)
925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
926 983
927 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
928 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
929 986
930 /* No overhead */ 987 /* No overhead */
931 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
932 if (!ret) 989 if (!ret)
933 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
934 991
935 /* No time */ 992 /* No time */
936 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
937 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
938 if (!ret) 995 if (!ret)
939 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -968,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
968 if (!event) 1025 if (!event)
969 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
970 1027
971 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
972 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
973 return ret; 1030 return ret;
974 } 1031 }
@@ -988,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
988 1045
989 1046
990enum print_line_t 1047enum print_line_t
991print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
992{ 1049{
993 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1009,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1009 if (data && data->failed) { 1066 if (data && data->failed) {
1010 field = &data->ent; 1067 field = &data->ent;
1011 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1029,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
1029 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1030 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1031 saved = *field; 1088 saved = *field;
1032 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1033 } 1090 }
1034 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1035 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1036 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1037 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1038 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
 1098	/* don't trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1039 default: 1101 default:
1040 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1041 } 1103 }
1042 1104
1043 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1044} 1106}
1045 1107
1046static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1117{
1118 return print_graph_function(iter);
1119}
1120
1121static void print_lat_header(struct seq_file *s, u32 flags)
1047{ 1122{
1048 static const char spaces[] = " " /* 16 spaces */ 1123 static const char spaces[] = " " /* 16 spaces */
1049 " " /* 4 spaces */ 1124 " " /* 4 spaces */
1050 " "; /* 17 spaces */ 1125 " "; /* 17 spaces */
1051 int size = 0; 1126 int size = 0;
1052 1127
1053 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1128 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1054 size += 16; 1129 size += 16;
1055 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1130 if (flags & TRACE_GRAPH_PRINT_CPU)
1056 size += 4; 1131 size += 4;
1057 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1132 if (flags & TRACE_GRAPH_PRINT_PROC)
1058 size += 17; 1133 size += 17;
1059 1134
1060 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1135 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1065,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
1065 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1140 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1066} 1141}
1067 1142
1068static void print_graph_headers(struct seq_file *s) 1143void print_graph_headers_flags(struct seq_file *s, u32 flags)
1069{ 1144{
1070 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1071 1146
1072 if (lat) 1147 if (lat)
1073 print_lat_header(s); 1148 print_lat_header(s, flags);
1074 1149
1075 /* 1st line */ 1150 /* 1st line */
1076 seq_printf(s, "#"); 1151 seq_printf(s, "#");
1077 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1152 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1078 seq_printf(s, " TIME "); 1153 seq_printf(s, " TIME ");
1079 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1154 if (flags & TRACE_GRAPH_PRINT_CPU)
1080 seq_printf(s, " CPU"); 1155 seq_printf(s, " CPU");
1081 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1156 if (flags & TRACE_GRAPH_PRINT_PROC)
1082 seq_printf(s, " TASK/PID "); 1157 seq_printf(s, " TASK/PID ");
1083 if (lat) 1158 if (lat)
1084 seq_printf(s, "|||||"); 1159 seq_printf(s, "|||||");
1085 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1160 if (flags & TRACE_GRAPH_PRINT_DURATION)
1086 seq_printf(s, " DURATION "); 1161 seq_printf(s, " DURATION ");
1087 seq_printf(s, " FUNCTION CALLS\n"); 1162 seq_printf(s, " FUNCTION CALLS\n");
1088 1163
1089 /* 2nd line */ 1164 /* 2nd line */
1090 seq_printf(s, "#"); 1165 seq_printf(s, "#");
1091 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1166 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1092 seq_printf(s, " | "); 1167 seq_printf(s, " | ");
1093 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1168 if (flags & TRACE_GRAPH_PRINT_CPU)
1094 seq_printf(s, " | "); 1169 seq_printf(s, " | ");
1095 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1170 if (flags & TRACE_GRAPH_PRINT_PROC)
1096 seq_printf(s, " | | "); 1171 seq_printf(s, " | | ");
1097 if (lat) 1172 if (lat)
1098 seq_printf(s, "|||||"); 1173 seq_printf(s, "|||||");
1099 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1174 if (flags & TRACE_GRAPH_PRINT_DURATION)
1100 seq_printf(s, " | | "); 1175 seq_printf(s, " | | ");
1101 seq_printf(s, " | | | |\n"); 1176 seq_printf(s, " | | | |\n");
1102} 1177}
1103 1178
1104static void graph_trace_open(struct trace_iterator *iter) 1179void print_graph_headers(struct seq_file *s)
1180{
1181 print_graph_headers_flags(s, tracer_flags.val);
1182}
1183
1184void graph_trace_open(struct trace_iterator *iter)
1105{ 1185{
1106 /* pid and depth on the last trace processed */ 1186 /* pid and depth on the last trace processed */
1107 struct fgraph_data *data; 1187 struct fgraph_data *data;
@@ -1136,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1136 pr_warning("function graph tracer: not enough memory\n"); 1216 pr_warning("function graph tracer: not enough memory\n");
1137} 1217}
1138 1218
1139static void graph_trace_close(struct trace_iterator *iter) 1219void graph_trace_close(struct trace_iterator *iter)
1140{ 1220{
1141 struct fgraph_data *data = iter->private; 1221 struct fgraph_data *data = iter->private;
1142 1222
@@ -1146,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1146 } 1226 }
1147} 1227}
1148 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_entry_event = {
1234 .type = TRACE_GRAPH_ENT,
1235 .funcs = &graph_functions,
1236};
1237
1238static struct trace_event graph_trace_ret_event = {
1239 .type = TRACE_GRAPH_RET,
1240 .funcs = &graph_functions
1241};
1242
1149static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
1150 .name = "function_graph", 1244 .name = "function_graph",
1151 .open = graph_trace_open, 1245 .open = graph_trace_open,
@@ -1167,6 +1261,16 @@ static __init int init_graph_trace(void)
1167{ 1261{
1168 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1262 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1169 1263
1264 if (!register_ftrace_event(&graph_trace_entry_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1269 if (!register_ftrace_event(&graph_trace_ret_event)) {
1270 pr_warning("Warning: could not register graph trace events\n");
1271 return 1;
1272 }
1273
1170 return register_tracer(&graph_trace); 1274 return register_tracer(&graph_trace);
1171} 1275}
1172 1276
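
The hunks above turn the function-graph output and header routines into flag-parameterised helpers (print_graph_function_flags(), print_graph_headers_flags(), plus the now-exported graph_trace_open()/graph_trace_close()), so other tracers can reuse the graph formatting with their own option mask instead of the graph tracer's tracer_flags.val. A minimal sketch of a hypothetical consumer, mirroring what the trace_irqsoff.c changes further down actually do (the MY_GRAPH_FLAGS name and my_tracer_* callbacks are illustrative only):

#define MY_GRAPH_FLAGS	(TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_PROC | \
			 TRACE_GRAPH_PRINT_DURATION)

static enum print_line_t my_tracer_print_line(struct trace_iterator *iter)
{
	/* Render graph entries with this tracer's options, not tracer_flags.val */
	return print_graph_function_flags(iter, MY_GRAPH_FLAGS);
}

static void my_tracer_print_header(struct seq_file *s)
{
	/* Header columns must match the flags used for the trace body */
	print_graph_headers_flags(s, MY_GRAPH_FLAGS);
}
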
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ea90c0e2c96..f52b5f50299d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,61 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{ 93{
81 return regs_get_kernel_stack_nth(regs, 94 return fprm->fn(regs, fprm->data, dest);
82 (unsigned int)((unsigned long)num));
83} 95}
84 96
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
86{ 98/*
87 unsigned long retval; 99 * Define macro for basic types - we don't need to define s* types, because
88 100 * we have to care only about bitwidth at recording time.
89 if (probe_kernel_address(addr, retval)) 101 */
90 return 0; 102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
91 return retval; 103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
92} 121}
93 122DEFINE_BASIC_FETCH_FUNCS(reg)
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) 123
95{ 124#define DEFINE_FETCH_stack(type) \
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
126 void *offset, void *dest) \
127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
97} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
98 132
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 133#define DEFINE_FETCH_retval(type) \
100 void *dummy) 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
101{ 135 void *dummy, void *dest) \
102 return regs_return_value(regs); 136{ \
137 *(type *)dest = (type)regs_return_value(regs); \
103} 138}
104 139DEFINE_BASIC_FETCH_FUNCS(retval)
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 140
106 void *dummy) 141#define DEFINE_FETCH_memory(type) \
107{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
108 return kernel_stack_pointer(regs); 143 void *addr, void *dest) \
144{ \
145 type retval; \
146 if (probe_kernel_address(addr, retval)) \
147 *(type *)dest = 0; \
148 else \
149 *(type *)dest = retval; \
109} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
110 152
111/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
112struct symbol_cache { 154struct symbol_cache {
@@ -150,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
150 return sc; 192 return sc;
151} 193}
152 194
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
154{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
155 struct symbol_cache *sc = data; 197 void *data, void *dest) \
156 198{ \
157 if (sc->addr) 199 struct symbol_cache *sc = data; \
158 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
159 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
160 return 0; 202 else \
203 *(type *)dest = 0; \
161} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
162 206
163/* Special indirect memory access interface */ 207/* Dereference memory access function */
164struct indirect_fetch_data { 208struct deref_fetch_param {
165 struct fetch_func orig; 209 struct fetch_param orig;
166 long offset; 210 long offset;
167}; 211};
168 212
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
170{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
171 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
172 unsigned long addr; 216{ \
173 217 struct deref_fetch_param *dprm = data; \
174 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
175 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
176 addr += ind->offset; 220 if (addr) { \
177 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
178 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
179 return 0; 223 } else \
224 *(type *)dest = 0; \
180} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
181 227
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
183{ 229{
184 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
185 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
186 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
187 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
188 kfree(data); 234 kfree(data);
189} 235}
190 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
 266	const char	*fmt;		/* Format string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function : only accept unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
191/** 305/**
192 * Kprobe event core functions 306 * Kprobe event core functions
193 */ 307 */
194 308
195struct probe_arg { 309struct probe_arg {
196 struct fetch_func fetch; 310 struct fetch_param fetch;
197 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
198}; 315};
199 316
200/* Flags for trace_probe */ 317/* Flags for trace_probe */
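
With struct probe_arg now carrying a typed fetch function, a per-entry offset and a fetch_type, recording an argument amounts to letting each fetch function write its type->size bytes at its own offset inside the trace entry. A sketch of that call site (the helper name is hypothetical; the real logic sits in the probe handlers later in this file):

static void store_trace_arg(struct probe_arg *parg, struct pt_regs *regs,
			    void *entry_data)
{
	/* Writes parg->type->size bytes at the argument's precomputed offset */
	call_fetch(&parg->fetch, regs, (u8 *)entry_data + parg->offset);
}
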
@@ -207,8 +324,9 @@ struct trace_probe {
207 unsigned long nhit; 324 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
210 struct ftrace_event_call call; 328 struct ftrace_event_call call;
211 struct trace_event event; 329 ssize_t size; /* trace entry size */
212 unsigned int nr_args; 330 unsigned int nr_args;
213 struct probe_arg args[]; 331 struct probe_arg args[];
214}; 332};
@@ -217,6 +335,7 @@ struct trace_probe {
217 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
219 337
338
220static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
221{ 340{
222 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -227,51 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
227 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
228} 347}
229 348
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
277 351
@@ -330,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
330 goto error; 404 goto error;
331 } 405 }
332 406
407 tp->call.class = &tp->class;
333 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
334 if (!tp->call.name) 409 if (!tp->call.name)
335 goto error; 410 goto error;
@@ -339,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
339 goto error; 414 goto error;
340 } 415 }
341 416
342 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
343 if (!tp->call.system) 418 if (!tp->class.system)
344 goto error; 419 goto error;
345 420
346 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -354,11 +429,12 @@ error:
354 429
355static void free_probe_arg(struct probe_arg *arg) 430static void free_probe_arg(struct probe_arg *arg)
356{ 431{
357 if (arg->fetch.func == fetch_symbol) 432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
358 free_symbol_cache(arg->fetch.data); 435 free_symbol_cache(arg->fetch.data);
359 else if (arg->fetch.func == fetch_indirect)
360 free_indirect_fetch_data(arg->fetch.data);
361 kfree(arg->name); 436 kfree(arg->name);
437 kfree(arg->comm);
362} 438}
363 439
364static void free_trace_probe(struct trace_probe *tp) 440static void free_trace_probe(struct trace_probe *tp)
@@ -368,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
368 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
369 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
370 446
371 kfree(tp->call.system); 447 kfree(tp->call.class->system);
372 kfree(tp->call.name); 448 kfree(tp->call.name);
373 kfree(tp->symbol); 449 kfree(tp->symbol);
374 kfree(tp); 450 kfree(tp);
@@ -381,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
381 457
382 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
383 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
384 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
385 return tp; 461 return tp;
386 return NULL; 462 return NULL;
387} 463}
@@ -406,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
406 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
407 483
408 /* register as an event */ 484 /* register as an event */
409 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
410 if (old_tp) { 486 if (old_tp) {
411 /* delete old event */ 487 /* delete old event */
412 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -464,46 +540,41 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
464#define PARAM_MAX_ARGS 16 540#define PARAM_MAX_ARGS 16
465#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 541#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
466 542
467static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 543static int parse_probe_vars(char *arg, const struct fetch_type *t,
544 struct fetch_param *f, int is_return)
468{ 545{
469 int ret = 0; 546 int ret = 0;
470 unsigned long param; 547 unsigned long param;
471 548
472 if (strcmp(arg, "retval") == 0) { 549 if (strcmp(arg, "retval") == 0) {
473 if (is_return) { 550 if (is_return)
474 ff->func = fetch_retvalue; 551 f->fn = t->retval;
475 ff->data = NULL; 552 else
476 } else
477 ret = -EINVAL; 553 ret = -EINVAL;
478 } else if (strncmp(arg, "stack", 5) == 0) { 554 } else if (strncmp(arg, "stack", 5) == 0) {
479 if (arg[5] == '\0') { 555 if (arg[5] == '\0') {
480 ff->func = fetch_stack_address; 556 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
481 ff->data = NULL; 557 f->fn = fetch_stack_address;
558 else
559 ret = -EINVAL;
482 } else if (isdigit(arg[5])) { 560 } else if (isdigit(arg[5])) {
483 ret = strict_strtoul(arg + 5, 10, &param); 561 ret = strict_strtoul(arg + 5, 10, &param);
484 if (ret || param > PARAM_MAX_STACK) 562 if (ret || param > PARAM_MAX_STACK)
485 ret = -EINVAL; 563 ret = -EINVAL;
486 else { 564 else {
487 ff->func = fetch_stack; 565 f->fn = t->stack;
488 ff->data = (void *)param; 566 f->data = (void *)param;
489 } 567 }
490 } else 568 } else
491 ret = -EINVAL; 569 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 570 } else
501 ret = -EINVAL; 571 ret = -EINVAL;
502 return ret; 572 return ret;
503} 573}
504 574
505/* Recursive argument parser */ 575/* Recursive argument parser */
506static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 576static int __parse_probe_arg(char *arg, const struct fetch_type *t,
577 struct fetch_param *f, int is_return)
507{ 578{
508 int ret = 0; 579 int ret = 0;
509 unsigned long param; 580 unsigned long param;
@@ -512,13 +583,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 583
513 switch (arg[0]) { 584 switch (arg[0]) {
514 case '$': 585 case '$':
515 ret = parse_probe_vars(arg + 1, ff, is_return); 586 ret = parse_probe_vars(arg + 1, t, f, is_return);
516 break; 587 break;
517 case '%': /* named register */ 588 case '%': /* named register */
518 ret = regs_query_register_offset(arg + 1); 589 ret = regs_query_register_offset(arg + 1);
519 if (ret >= 0) { 590 if (ret >= 0) {
520 ff->func = fetch_register; 591 f->fn = t->reg;
521 ff->data = (void *)(unsigned long)ret; 592 f->data = (void *)(unsigned long)ret;
522 ret = 0; 593 ret = 0;
523 } 594 }
524 break; 595 break;
@@ -527,26 +598,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
527 ret = strict_strtoul(arg + 1, 0, &param); 598 ret = strict_strtoul(arg + 1, 0, &param);
528 if (ret) 599 if (ret)
529 break; 600 break;
530 ff->func = fetch_memory; 601 f->fn = t->memory;
531 ff->data = (void *)param; 602 f->data = (void *)param;
532 } else { 603 } else {
533 ret = split_symbol_offset(arg + 1, &offset); 604 ret = split_symbol_offset(arg + 1, &offset);
534 if (ret) 605 if (ret)
535 break; 606 break;
536 ff->data = alloc_symbol_cache(arg + 1, offset); 607 f->data = alloc_symbol_cache(arg + 1, offset);
537 if (ff->data) 608 if (f->data)
538 ff->func = fetch_symbol; 609 f->fn = t->symbol;
539 else
540 ret = -EINVAL;
541 } 610 }
542 break; 611 break;
543 case '+': /* indirect memory */ 612 case '+': /* deref memory */
544 case '-': 613 case '-':
545 tmp = strchr(arg, '('); 614 tmp = strchr(arg, '(');
546 if (!tmp) { 615 if (!tmp)
547 ret = -EINVAL;
548 break; 616 break;
549 }
550 *tmp = '\0'; 617 *tmp = '\0';
551 ret = strict_strtol(arg + 1, 0, &offset); 618 ret = strict_strtol(arg + 1, 0, &offset);
552 if (ret) 619 if (ret)
@@ -556,38 +623,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
556 arg = tmp + 1; 623 arg = tmp + 1;
557 tmp = strrchr(arg, ')'); 624 tmp = strrchr(arg, ')');
558 if (tmp) { 625 if (tmp) {
559 struct indirect_fetch_data *id; 626 struct deref_fetch_param *dprm;
627 const struct fetch_type *t2 = find_fetch_type(NULL);
560 *tmp = '\0'; 628 *tmp = '\0';
561 id = kzalloc(sizeof(struct indirect_fetch_data), 629 dprm = kzalloc(sizeof(struct deref_fetch_param),
562 GFP_KERNEL); 630 GFP_KERNEL);
563 if (!id) 631 if (!dprm)
564 return -ENOMEM; 632 return -ENOMEM;
565 id->offset = offset; 633 dprm->offset = offset;
566 ret = __parse_probe_arg(arg, &id->orig, is_return); 634 ret = __parse_probe_arg(arg, t2, &dprm->orig,
635 is_return);
567 if (ret) 636 if (ret)
568 kfree(id); 637 kfree(dprm);
569 else { 638 else {
570 ff->func = fetch_indirect; 639 f->fn = t->deref;
571 ff->data = (void *)id; 640 f->data = (void *)dprm;
572 } 641 }
573 } else 642 }
574 ret = -EINVAL;
575 break; 643 break;
576 default:
577 /* TODO: support custom handler */
578 ret = -EINVAL;
579 } 644 }
645 if (!ret && !f->fn)
646 ret = -EINVAL;
580 return ret; 647 return ret;
581} 648}
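
The deref case above composes one fetch inside another: a deref_fetch_param records an offset plus a nested fetch_param, so a +|-offs(ARG) argument resolves ARG first, adds the offset, and reads memory at the result. What follows is a minimal userspace sketch of that composition pattern, not part of the patch; the fake register file, the indexing in fetch_register() and main() are invented for the illustration.

#include <stdio.h>

/* A fetch is a function pointer plus opaque data, as in struct fetch_param. */
struct fetch_param {
	unsigned long (*fn)(void *regs, void *data);
	void *data;
};

/* Pretend register file: data selects an index into an array. */
static unsigned long fetch_register(void *regs, void *data)
{
	unsigned long *r = regs;

	return r[(unsigned long)data];
}

/* Dereference: run the nested fetch, add the offset, read memory there. */
struct deref_fetch_param {
	struct fetch_param orig;
	long offset;
};

static unsigned long fetch_deref(void *regs, void *data)
{
	struct deref_fetch_param *dprm = data;
	unsigned long addr = dprm->orig.fn(regs, dprm->orig.data);

	return *(unsigned long *)(addr + dprm->offset);
}

int main(void)
{
	unsigned long mem[4] = { 111, 222, 333, 444 };
	unsigned long regs[1] = { (unsigned long)mem };
	struct deref_fetch_param dprm = {
		.orig = { .fn = fetch_register, .data = (void *)0UL },
		.offset = 2 * sizeof(unsigned long),	/* like +16(reg 0) */
	};
	struct fetch_param f = { .fn = fetch_deref, .data = &dprm };

	/* Prints 333: register 0 holds &mem[0], the deref adds two words. */
	printf("%lu\n", f.fn(regs, f.data));
	return 0;
}
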
582 649
583/* String length checking wrapper */ 650/* String length checking wrapper */
584static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 651static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return)
585{ 653{
654 const char *t;
655
586 if (strlen(arg) > MAX_ARGSTR_LEN) { 656 if (strlen(arg) > MAX_ARGSTR_LEN) {
587 pr_info("Argument is too long.: %s\n", arg); 657 pr_info("Argument is too long.: %s\n", arg);
588 return -ENOSPC; 658 return -ENOSPC;
589 } 659 }
590 return __parse_probe_arg(arg, ff, is_return); 660 parg->comm = kstrdup(arg, GFP_KERNEL);
661 if (!parg->comm) {
662 pr_info("Failed to allocate memory for command '%s'.\n", arg);
663 return -ENOMEM;
664 }
665 t = strchr(parg->comm, ':');
666 if (t) {
667 arg[t - parg->comm] = '\0';
668 t++;
669 }
670 parg->type = find_fetch_type(t);
671 if (!parg->type) {
672 pr_info("Unsupported type: %s\n", t);
673 return -EINVAL;
674 }
675 parg->offset = tp->size;
676 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
591} 678}
592 679
593/* Return 1 if name is reserved or already used by another argument */ 680/* Return 1 if name is reserved or already used by another argument */
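
parse_probe_arg() now duplicates the raw argument into parg->comm, splits an optional ":TYPE" suffix at the colon, and resolves the type through find_fetch_type(), which supplies the size, signedness and print handler used everywhere below. Here is a standalone sketch of that split-and-lookup step, assuming a small table of integer types; the table contents and the default entry are illustrative rather than the kernel's exact list.

#include <stdio.h>
#include <string.h>

/* Per-type descriptor, loosely modelled on struct fetch_type. */
struct fetch_type {
	const char *name;
	size_t size;
	int is_signed;
};

static const struct fetch_type fetch_types[] = {
	{ "u8",  1, 0 }, { "u16", 2, 0 }, { "u32", 4, 0 }, { "u64", 8, 0 },
	{ "s8",  1, 1 }, { "s16", 2, 1 }, { "s32", 4, 1 }, { "s64", 8, 1 },
};

/* NULL (no ":TYPE" given) falls back to a default entry. */
static const struct fetch_type *find_fetch_type(const char *type)
{
	size_t i;

	if (!type)
		return &fetch_types[3];
	for (i = 0; i < sizeof(fetch_types) / sizeof(fetch_types[0]); i++)
		if (!strcmp(type, fetch_types[i].name))
			return &fetch_types[i];
	return NULL;
}

int main(void)
{
	char arg[] = "%ax:s32";
	char *type = strchr(arg, ':');
	const struct fetch_type *t;

	if (type)
		*type++ = '\0';		/* arg = "%ax", type = "s32" */

	t = find_fetch_type(type);
	if (!t)
		return 1;
	printf("arg=%s type=%s size=%zu signed=%d\n",
	       arg, t->name, t->size, t->is_signed);
	return 0;
}
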
@@ -611,22 +698,24 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 698 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 699 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 700 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 701 * $retval : fetch return value
616 * $stack : fetch stack address 702 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 703 * $stackN : fetch Nth of stack (N:0-)
618 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 704 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
619 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 705 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
620 * %REG : fetch register REG 706 * %REG : fetch register REG
621 * Indirect memory fetch: 707 * Dereferencing memory fetch:
622 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 708 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
623 * Alias name of args: 709 * Alias name of args:
624 * NAME=FETCHARG : set NAME as alias of FETCHARG. 710 * NAME=FETCHARG : set NAME as alias of FETCHARG.
711 * Type of args:
712 * FETCHARG:TYPE : use TYPE instead of unsigned long.
625 */ 713 */
626 struct trace_probe *tp; 714 struct trace_probe *tp;
627 int i, ret = 0; 715 int i, ret = 0;
628 int is_return = 0, is_delete = 0; 716 int is_return = 0, is_delete = 0;
629 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 717 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp;
630 unsigned long offset = 0; 719 unsigned long offset = 0;
631 void *addr = NULL; 720 void *addr = NULL;
632 char buf[MAX_EVENT_NAME_LEN]; 721 char buf[MAX_EVENT_NAME_LEN];
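
With the type suffix wired in, a probe definition such as p:myprobe do_sys_open dfd=%ax:s32 can be appended to the kprobe_events control file. Below is a small userspace sketch of doing that from C; the debugfs mount point, the symbol and the register choice are examples only and depend on the running kernel and architecture.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumes debugfs is mounted at /sys/kernel/debug. */
	const char *ctrl = "/sys/kernel/debug/tracing/kprobe_events";
	/* p:<event> <symbol> <name>=<fetcharg>:<type>, all illustrative. */
	const char *spec = "p:myprobe do_sys_open dfd=%ax:s32\n";
	int fd = open(ctrl, O_WRONLY | O_APPEND);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, spec, strlen(spec)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Removing the probe works through the same file by writing "-:myprobe", the form the reworked selftest below uses for its test probes.
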
@@ -651,12 +740,12 @@ static int create_trace_probe(int argc, char **argv)
651 event = strchr(group, '/') + 1; 740 event = strchr(group, '/') + 1;
652 event[-1] = '\0'; 741 event[-1] = '\0';
653 if (strlen(group) == 0) { 742 if (strlen(group) == 0) {
654 pr_info("Group name is not specifiled\n"); 743 pr_info("Group name is not specified\n");
655 return -EINVAL; 744 return -EINVAL;
656 } 745 }
657 } 746 }
658 if (strlen(event) == 0) { 747 if (strlen(event) == 0) {
659 pr_info("Event name is not specifiled\n"); 748 pr_info("Event name is not specified\n");
660 return -EINVAL; 749 return -EINVAL;
661 } 750 }
662 } 751 }
@@ -689,7 +778,7 @@ static int create_trace_probe(int argc, char **argv)
689 return -EINVAL; 778 return -EINVAL;
690 } 779 }
691 /* an address specified */ 780 /* an address specified */
692 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 781 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
693 if (ret) { 782 if (ret) {
694 pr_info("Failed to parse address.\n"); 783 pr_info("Failed to parse address.\n");
695 return ret; 784 return ret;
@@ -739,13 +828,6 @@ static int create_trace_probe(int argc, char **argv)
739 else 828 else
740 arg = argv[i]; 829 arg = argv[i];
741 830
742 if (conflict_field_name(argv[i], tp->args, i)) {
743 pr_info("Argument%d name '%s' conflicts with "
744 "another field.\n", i, argv[i]);
745 ret = -EINVAL;
746 goto error;
747 }
748
749 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
750 if (!tp->args[i].name) { 832 if (!tp->args[i].name) {
751 pr_info("Failed to allocate argument%d name '%s'.\n", 833 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -753,9 +835,19 @@ static int create_trace_probe(int argc, char **argv)
753 ret = -ENOMEM; 835 ret = -ENOMEM;
754 goto error; 836 goto error;
755 } 837 }
838 tmp = strchr(tp->args[i].name, ':');
839 if (tmp)
840 *tmp = '_'; /* convert : to _ */
841
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with "
844 "another field.\n", i, argv[i]);
845 ret = -EINVAL;
846 goto error;
847 }
756 848
757 /* Parse fetch argument */ 849 /* Parse fetch argument */
758 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
759 if (ret) { 851 if (ret) {
760 pr_info("Parse error at argument%d. (%d)\n", i, ret); 852 pr_info("Parse error at argument%d. (%d)\n", i, ret);
761 kfree(tp->args[i].name); 853 kfree(tp->args[i].name);
@@ -810,11 +902,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
810static int probes_seq_show(struct seq_file *m, void *v) 902static int probes_seq_show(struct seq_file *m, void *v)
811{ 903{
812 struct trace_probe *tp = v; 904 struct trace_probe *tp = v;
813 int i, ret; 905 int i;
814 char buf[MAX_ARGSTR_LEN + 1];
815 906
816 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
817 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
818 909
819 if (!tp->symbol) 910 if (!tp->symbol)
820 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -823,15 +914,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
823 else 914 else
824 seq_printf(m, " %s", probe_symbol(tp)); 915 seq_printf(m, " %s", probe_symbol(tp));
825 916
826 for (i = 0; i < tp->nr_args; i++) { 917 for (i = 0; i < tp->nr_args; i++)
827 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 918 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
828 if (ret < 0) {
829 pr_warning("Argument%d decoding error(%d).\n", i, ret);
830 return ret;
831 }
832 seq_printf(m, " %s=%s", tp->args[i].name, buf);
833 }
834 seq_printf(m, "\n"); 919 seq_printf(m, "\n");
920
835 return 0; 921 return 0;
836} 922}
837 923
@@ -958,12 +1044,13 @@ static const struct file_operations kprobe_profile_ops = {
958}; 1044};
959 1045
960/* Kprobe handler */ 1046/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 1048{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1049 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 1050 struct kprobe_trace_entry_head *entry;
965 struct ring_buffer_event *event; 1051 struct ring_buffer_event *event;
966 struct ring_buffer *buffer; 1052 struct ring_buffer *buffer;
1053 u8 *data;
967 int size, i, pc; 1054 int size, i, pc;
968 unsigned long irq_flags; 1055 unsigned long irq_flags;
969 struct ftrace_event_call *call = &tp->call; 1056 struct ftrace_event_call *call = &tp->call;
@@ -973,32 +1060,32 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
973 local_save_flags(irq_flags); 1060 local_save_flags(irq_flags);
974 pc = preempt_count(); 1061 pc = preempt_count();
975 1062
976 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1063 size = sizeof(*entry) + tp->size;
977 1064
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
979 irq_flags, pc); 1066 size, irq_flags, pc);
980 if (!event) 1067 if (!event)
981 return 0; 1068 return;
982 1069
983 entry = ring_buffer_event_data(event); 1070 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args;
985 entry->ip = (unsigned long)kp->addr; 1071 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1];
986 for (i = 0; i < tp->nr_args; i++) 1073 for (i = 0; i < tp->nr_args; i++)
987 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
988 1075
989 if (!filter_current_check_discard(buffer, call, entry, event)) 1076 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 1078}
993 1079
994/* Kretprobe handler */ 1080/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 1081static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 1082 struct pt_regs *regs)
997{ 1083{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1084 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
999 struct kretprobe_trace_entry *entry; 1085 struct kretprobe_trace_entry_head *entry;
1000 struct ring_buffer_event *event; 1086 struct ring_buffer_event *event;
1001 struct ring_buffer *buffer; 1087 struct ring_buffer *buffer;
1088 u8 *data;
1002 int size, i, pc; 1089 int size, i, pc;
1003 unsigned long irq_flags; 1090 unsigned long irq_flags;
1004 struct ftrace_event_call *call = &tp->call; 1091 struct ftrace_event_call *call = &tp->call;
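
kprobe_trace_func() above now reserves sizeof(*entry) + tp->size bytes and lets each fetch callback store its own width at the offset assigned during parsing, instead of filling a fixed array of unsigned long. The same header-plus-payload layout in plain C; struct entry_head and the two arguments here are invented for the sketch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Fixed header followed by a variable payload, as in *_trace_entry_head. */
struct entry_head {
	unsigned long ip;
};

struct arg_desc {
	const char *name;
	size_t size;	/* parg->type->size */
	size_t offset;	/* parg->offset, assigned cumulatively at parse time */
};

int main(void)
{
	struct arg_desc args[] = {
		{ "dfd",   sizeof(int32_t),  0 },
		{ "flags", sizeof(uint64_t), sizeof(int32_t) },
	};
	size_t payload = sizeof(int32_t) + sizeof(uint64_t);	/* tp->size */
	struct entry_head *entry = malloc(sizeof(*entry) + payload);
	int32_t dfd = -100, dfd_out;
	uint64_t flags = 0x8000, flags_out;
	uint8_t *data;

	if (!entry)
		return 1;
	entry->ip = 0xc0ffee;
	data = (uint8_t *)&entry[1];	/* payload starts right after the header */

	/* Each "fetch" writes its own width at its recorded offset. */
	memcpy(data + args[0].offset, &dfd, args[0].size);
	memcpy(data + args[1].offset, &flags, args[1].size);

	memcpy(&dfd_out, data + args[0].offset, args[0].size);
	memcpy(&flags_out, data + args[1].offset, args[1].size);
	printf("ip=%lx %s=%d %s=%llu\n", entry->ip, args[0].name, dfd_out,
	       args[1].name, (unsigned long long)flags_out);
	free(entry);
	return 0;
}
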
@@ -1006,39 +1093,37 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1006 local_save_flags(irq_flags); 1093 local_save_flags(irq_flags);
1007 pc = preempt_count(); 1094 pc = preempt_count();
1008 1095
1009 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1096 size = sizeof(*entry) + tp->size;
1010 1097
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1012 irq_flags, pc); 1099 size, irq_flags, pc);
1013 if (!event) 1100 if (!event)
1014 return 0; 1101 return;
1015 1102
1016 entry = ring_buffer_event_data(event); 1103 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args;
1018 entry->func = (unsigned long)tp->rp.kp.addr; 1104 entry->func = (unsigned long)tp->rp.kp.addr;
1019 entry->ret_ip = (unsigned long)ri->ret_addr; 1105 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1];
1020 for (i = 0; i < tp->nr_args; i++) 1107 for (i = 0; i < tp->nr_args; i++)
1021 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1022 1109
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1110 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1112}
1028 1113
1029/* Event entry printers */ 1114/* Event entry printers */
1030enum print_line_t 1115enum print_line_t
1031print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1032{ 1118{
1033 struct kprobe_trace_entry *field; 1119 struct kprobe_trace_entry_head *field;
1034 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1035 struct trace_event *event;
1036 struct trace_probe *tp; 1121 struct trace_probe *tp;
1122 u8 *data;
1037 int i; 1123 int i;
1038 1124
1039 field = (struct kprobe_trace_entry *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1040 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1041 tp = container_of(event, struct trace_probe, event);
1042 1127
1043 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1044 goto partial; 1129 goto partial;
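
Because struct trace_event is now embedded in the probe as call.event, print_kprobe_event() recovers the enclosing trace_probe with container_of() instead of calling ftrace_find_event(). A self-contained illustration of container_of() follows, with the macro reimplemented locally; in the kernel it comes from linux/kernel.h, and the structures below are simplified stand-ins.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct trace_event { int type; };

struct trace_probe {
	const char *name;
	struct trace_event event;	/* embedded, like call.event */
};

static void print_event(struct trace_event *ev)
{
	/* Walk back from the embedded member to the enclosing probe. */
	struct trace_probe *tp = container_of(ev, struct trace_probe, event);

	printf("probe %s (type %d)\n", tp->name, ev->type);
}

int main(void)
{
	struct trace_probe tp = { .name = "myprobe", .event = { .type = 42 } };

	print_event(&tp.event);
	return 0;
}
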
@@ -1049,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1049 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1050 goto partial; 1135 goto partial;
1051 1136
1052 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1053 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1054 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1055 goto partial; 1141 goto partial;
1056 1142
1057 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1063,17 +1149,17 @@ partial:
1063} 1149}
1064 1150
1065enum print_line_t 1151enum print_line_t
1066print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1067{ 1154{
1068 struct kretprobe_trace_entry *field; 1155 struct kretprobe_trace_entry_head *field;
1069 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1070 struct trace_event *event;
1071 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1072 int i; 1159 int i;
1073 1160
1074 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1075 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1076 tp = container_of(event, struct trace_probe, event);
1077 1163
1078 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1079 goto partial; 1165 goto partial;
@@ -1090,9 +1176,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1090 if (!trace_seq_puts(s, ")")) 1176 if (!trace_seq_puts(s, ")"))
1091 goto partial; 1177 goto partial;
1092 1178
1093 for (i = 0; i < field->nargs; i++) 1179 data = (u8 *)&field[1];
1094 if (!trace_seq_printf(s, " %s=%lx", 1180 for (i = 0; i < tp->nr_args; i++)
1095 tp->args[i].name, field->args[i])) 1181 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset))
1096 goto partial; 1183 goto partial;
1097 1184
1098 if (!trace_seq_puts(s, "\n")) 1185 if (!trace_seq_puts(s, "\n"))
@@ -1129,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1129 1216
1130static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1131{ 1218{
1132 INIT_LIST_HEAD(&event_call->fields);
1133
1134 return 0; 1219 return 0;
1135} 1220}
1136 1221
@@ -1148,242 +1233,170 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1148static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1233static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1149{ 1234{
1150 int ret, i; 1235 int ret, i;
1151 struct kprobe_trace_entry field; 1236 struct kprobe_trace_entry_head field;
1152 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1237 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1153 1238
1154 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1155 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1156 /* Set argument names as fields */ 1240 /* Set argument names as fields */
1157 for (i = 0; i < tp->nr_args; i++) 1241 for (i = 0; i < tp->nr_args; i++) {
1158 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1242 ret = trace_define_field(event_call, tp->args[i].type->name,
1243 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size,
1246 tp->args[i].type->is_signed,
1247 FILTER_OTHER);
1248 if (ret)
1249 return ret;
1250 }
1159 return 0; 1251 return 0;
1160} 1252}
1161 1253
1162static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1254static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1163{ 1255{
1164 int ret, i; 1256 int ret, i;
1165 struct kretprobe_trace_entry field; 1257 struct kretprobe_trace_entry_head field;
1166 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1258 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1167 1259
1168 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1260 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1169 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1170 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1171 /* Set argument names as fields */ 1262 /* Set argument names as fields */
1172 for (i = 0; i < tp->nr_args; i++) 1263 for (i = 0; i < tp->nr_args; i++) {
1173 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1264 ret = trace_define_field(event_call, tp->args[i].type->name,
1265 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size,
1268 tp->args[i].type->is_signed,
1269 FILTER_OTHER);
1270 if (ret)
1271 return ret;
1272 }
1174 return 0; 1273 return 0;
1175} 1274}
1176 1275
1177static int __probe_event_show_format(struct trace_seq *s, 1276static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1178 struct trace_probe *tp, const char *fmt,
1179 const char *arg)
1180{ 1277{
1181 int i; 1278 int i;
1279 int pos = 0;
1182 1280
1183 /* Show format */ 1281 const char *fmt, *arg;
1184 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1185 return 0;
1186 1282
1187 for (i = 0; i < tp->nr_args; i++) 1283 if (!probe_is_return(tp)) {
1188 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1284 fmt = "(%lx)";
1189 return 0; 1285 arg = "REC->" FIELD_STRING_IP;
1286 } else {
1287 fmt = "(%lx <- %lx)";
1288 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1289 }
1190 1290
1191 if (!trace_seq_printf(s, "\", %s", arg)) 1291 /* When len=0, we just calculate the needed length */
1192 return 0; 1292#define LEN_OR_ZERO (len ? len - pos : 0)
1193 1293
1194 for (i = 0; i < tp->nr_args; i++) 1294 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1195 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1196 return 0;
1197 1295
1198 return trace_seq_puts(s, "\n"); 1296 for (i = 0; i < tp->nr_args; i++) {
1199} 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1298 tp->args[i].name, tp->args[i].type->fmt);
1299 }
1200 1300
1201#undef SHOW_FIELD 1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1202#define SHOW_FIELD(type, item, name) \
1203 do { \
1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1206 (unsigned int)offsetof(typeof(field), item),\
1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1209 if (!ret) \
1210 return 0; \
1211 } while (0)
1212 1302
1213static int kprobe_event_show_format(struct ftrace_event_call *call, 1303 for (i = 0; i < tp->nr_args; i++) {
1214 struct trace_seq *s) 1304 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1215{ 1305 tp->args[i].name);
1216 struct kprobe_trace_entry field __attribute__((unused)); 1306 }
1217 int ret, i;
1218 struct trace_probe *tp = (struct trace_probe *)call->data;
1219 1307
1220 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1308#undef LEN_OR_ZERO
1221 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1222 1309
1223 /* Show fields */ 1310 /* return the length of print_fmt */
1224 for (i = 0; i < tp->nr_args; i++) 1311 return pos;
1225 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1226 trace_seq_puts(s, "\n");
1227
1228 return __probe_event_show_format(s, tp, "(%lx)",
1229 "REC->" FIELD_STRING_IP);
1230} 1312}
1231 1313
1232static int kretprobe_event_show_format(struct ftrace_event_call *call, 1314static int set_print_fmt(struct trace_probe *tp)
1233 struct trace_seq *s)
1234{ 1315{
1235 struct kretprobe_trace_entry field __attribute__((unused)); 1316 int len;
1236 int ret, i; 1317 char *print_fmt;
1237 struct trace_probe *tp = (struct trace_probe *)call->data;
1238 1318
1239 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1319 /* First: called with 0 length to calculate the needed length */
1240 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1320 len = __set_print_fmt(tp, NULL, 0);
1241 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1321 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1322 if (!print_fmt)
1323 return -ENOMEM;
1242 1324
1243 /* Show fields */ 1325 /* Second: actually write the @print_fmt */
1244 for (i = 0; i < tp->nr_args; i++) 1326 __set_print_fmt(tp, print_fmt, len + 1);
1245 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1327 tp->call.print_fmt = print_fmt;
1246 trace_seq_puts(s, "\n");
1247 1328
1248 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1329 return 0;
1249 "REC->" FIELD_STRING_FUNC
1250 ", REC->" FIELD_STRING_RETIP);
1251} 1330}
1252 1331
1253#ifdef CONFIG_EVENT_PROFILE 1332#ifdef CONFIG_PERF_EVENTS
1254 1333
1255/* Kprobe profile handler */ 1334/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp, 1335static __kprobes void kprobe_perf_func(struct kprobe *kp,
1257 struct pt_regs *regs) 1336 struct pt_regs *regs)
1258{ 1337{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call; 1339 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry; 1340 struct kprobe_trace_entry_head *entry;
1262 struct trace_entry *ent; 1341 struct hlist_head *head;
1263 int size, __size, i, pc, __cpu; 1342 u8 *data;
1264 unsigned long irq_flags; 1343 int size, __size, i;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx; 1344 int rctx;
1268 1345
1269 pc = preempt_count(); 1346 __size = sizeof(*entry) + tp->size;
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1347 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32); 1348 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1274 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1275 return 0; 1351 return;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282
1283 rctx = perf_swevent_get_recursion_context();
1284 if (rctx < 0)
1285 goto end_recursion;
1286 1352
1287 __cpu = smp_processor_id(); 1353 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1288 1354 if (!entry)
1289 if (in_nmi()) 1355 return;
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293
1294 if (!trace_buf)
1295 goto end;
1296
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303 1356
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr; 1357 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1];
1308 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311
1312end:
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316 1361
1317 return 0; 1362 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1318} 1364}
1319 1365
1320/* Kretprobe profile handler */ 1366/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1367static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs) 1368 struct pt_regs *regs)
1323{ 1369{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call; 1371 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry; 1372 struct kretprobe_trace_entry_head *entry;
1327 struct trace_entry *ent; 1373 struct hlist_head *head;
1328 int size, __size, i, pc, __cpu; 1374 u8 *data;
1329 unsigned long irq_flags; 1375 int size, __size, i;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx; 1376 int rctx;
1333 1377
1334 pc = preempt_count(); 1378 __size = sizeof(*entry) + tp->size;
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1379 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32); 1380 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1339 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1340 return 0; 1383 return;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347 1384
1348 rctx = perf_swevent_get_recursion_context(); 1385 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1349 if (rctx < 0) 1386 if (!entry)
1350 goto end_recursion; 1387 return;
1351
1352 __cpu = smp_processor_id();
1353
1354 if (in_nmi())
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1356 else
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1366 entry = (struct kretprobe_trace_entry *)raw_data;
1367 ent = &entry->ent;
1368 1388
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr; 1389 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr; 1390 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1];
1374 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382 1394
1383 return 0; 1395 head = this_cpu_ptr(call->perf_events);
1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1384} 1397}
1385 1398
1386static int probe_profile_enable(struct ftrace_event_call *call) 1399static int probe_perf_enable(struct ftrace_event_call *call)
1387{ 1400{
1388 struct trace_probe *tp = (struct trace_probe *)call->data; 1401 struct trace_probe *tp = (struct trace_probe *)call->data;
1389 1402
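
__set_print_fmt() in the hunk above is written to be called twice: with len = 0 every snprintf() only measures, and the running pos adds up the length that set_print_fmt() must allocate before the second, real pass; the LEN_OR_ZERO macro keeps the remaining-space argument at zero while measuring. The same two-pass pattern as a standalone program; the format being assembled here is only an example.

#include <stdio.h>
#include <stdlib.h>

/* Builds "name=%d ..." into buf; returns the length needed (without NUL). */
static int build_fmt(char *buf, int len, const char *names[], int n)
{
	int i, pos = 0;

/* During the measuring pass (len == 0) nothing is written anywhere. */
#define DST		(len ? buf + pos : NULL)
#define LEN_OR_ZERO	(len ? len - pos : 0)

	pos += snprintf(DST, LEN_OR_ZERO, "\"");
	for (i = 0; i < n; i++)
		pos += snprintf(DST, LEN_OR_ZERO, "%s%s=%%d",
				i ? " " : "", names[i]);
	pos += snprintf(DST, LEN_OR_ZERO, "\"");

#undef DST
#undef LEN_OR_ZERO
	return pos;
}

int main(void)
{
	const char *names[] = { "dfd", "flags", "mode" };
	int len, n = 3;
	char *fmt;

	/* First pass measures, second pass fills the freshly sized buffer. */
	len = build_fmt(NULL, 0, names, n);
	fmt = malloc(len + 1);
	if (!fmt)
		return 1;
	build_fmt(fmt, len + 1, names, n);
	printf("print fmt: %s (needs %d bytes + NUL)\n", fmt, len);
	free(fmt);
	return 0;
}
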
@@ -1395,7 +1408,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1395 return enable_kprobe(&tp->rp.kp); 1408 return enable_kprobe(&tp->rp.kp);
1396} 1409}
1397 1410
1398static void probe_profile_disable(struct ftrace_event_call *call) 1411static void probe_perf_disable(struct ftrace_event_call *call)
1399{ 1412{
1400 struct trace_probe *tp = (struct trace_probe *)call->data; 1413 struct trace_probe *tp = (struct trace_probe *)call->data;
1401 1414
@@ -1408,8 +1421,28 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1408 disable_kprobe(&tp->rp.kp); 1421 disable_kprobe(&tp->rp.kp);
1409 } 1422 }
1410} 1423}
1411#endif /* CONFIG_EVENT_PROFILE */ 1424#endif /* CONFIG_PERF_EVENTS */
1412 1425
1426static __kprobes
1427int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1428{
1429 switch (type) {
1430 case TRACE_REG_REGISTER:
1431 return probe_event_enable(event);
1432 case TRACE_REG_UNREGISTER:
1433 probe_event_disable(event);
1434 return 0;
1435
1436#ifdef CONFIG_PERF_EVENTS
1437 case TRACE_REG_PERF_REGISTER:
1438 return probe_perf_enable(event);
1439 case TRACE_REG_PERF_UNREGISTER:
1440 probe_perf_disable(event);
1441 return 0;
1442#endif
1443 }
1444 return 0;
1445}
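
kprobe_register() replaces the separate regfunc/unregfunc/profile_enable/profile_disable pointers with a single reg() hook that switches on enum trace_reg, so the ftrace and perf paths share one entry point. A compact sketch of that dispatch style; struct event and the handlers below are stand-ins that just print, not the kernel's event call.

#include <stdio.h>

enum trace_reg {
	TRACE_REG_REGISTER,
	TRACE_REG_UNREGISTER,
	TRACE_REG_PERF_REGISTER,
	TRACE_REG_PERF_UNREGISTER,
};

struct event {
	const char *name;
	int (*reg)(struct event *ev, enum trace_reg type);
};

static int my_event_reg(struct event *ev, enum trace_reg type)
{
	switch (type) {
	case TRACE_REG_REGISTER:
		printf("%s: enable tracing\n", ev->name);
		return 0;
	case TRACE_REG_UNREGISTER:
		printf("%s: disable tracing\n", ev->name);
		return 0;
	case TRACE_REG_PERF_REGISTER:
		printf("%s: enable perf\n", ev->name);
		return 0;
	case TRACE_REG_PERF_UNREGISTER:
		printf("%s: disable perf\n", ev->name);
		return 0;
	}
	return 0;
}

int main(void)
{
	struct event ev = { .name = "kprobe_event", .reg = my_event_reg };

	/* One callback covers both the ftrace and the perf paths. */
	ev.reg(&ev, TRACE_REG_REGISTER);
	ev.reg(&ev, TRACE_REG_PERF_REGISTER);
	ev.reg(&ev, TRACE_REG_PERF_UNREGISTER);
	ev.reg(&ev, TRACE_REG_UNREGISTER);
	return 0;
}
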
1413 1446
1414static __kprobes 1447static __kprobes
1415int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1448int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1418,10 +1451,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1418 1451
1419 if (tp->flags & TP_FLAG_TRACE) 1452 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs); 1453 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE 1454#ifdef CONFIG_PERF_EVENTS
1422 if (tp->flags & TP_FLAG_PROFILE) 1455 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs); 1456 kprobe_perf_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */ 1457#endif
1425 return 0; /* We don't tweek kernel, so just return 0 */ 1458 return 0; /* We don't tweek kernel, so just return 0 */
1426} 1459}
1427 1460
@@ -1432,13 +1465,21 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1432 1465
1433 if (tp->flags & TP_FLAG_TRACE) 1466 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs); 1467 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE 1468#ifdef CONFIG_PERF_EVENTS
1436 if (tp->flags & TP_FLAG_PROFILE) 1469 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs); 1470 kretprobe_perf_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */ 1471#endif
1439 return 0; /* We don't tweek kernel, so just return 0 */ 1472 return 0; /* We don't tweek kernel, so just return 0 */
1440} 1473}
1441 1474
1475static struct trace_event_functions kretprobe_funcs = {
1476 .trace = print_kretprobe_event
1477};
1478
1479static struct trace_event_functions kprobe_funcs = {
1480 .trace = print_kprobe_event
1481};
1482
1442static int register_probe_event(struct trace_probe *tp) 1483static int register_probe_event(struct trace_probe *tp)
1443{ 1484{
1444 struct ftrace_event_call *call = &tp->call; 1485 struct ftrace_event_call *call = &tp->call;
@@ -1446,33 +1487,31 @@ static int register_probe_event(struct trace_probe *tp)
1446 1487
1447 /* Initialize ftrace_event_call */ 1488 /* Initialize ftrace_event_call */
1448 if (probe_is_return(tp)) { 1489 if (probe_is_return(tp)) {
1449 tp->event.trace = print_kretprobe_event; 1490 INIT_LIST_HEAD(&call->class->fields);
1450 call->raw_init = probe_event_raw_init; 1491 call->event.funcs = &kretprobe_funcs;
1451 call->show_format = kretprobe_event_show_format; 1492 call->class->raw_init = probe_event_raw_init;
1452 call->define_fields = kretprobe_event_define_fields; 1493 call->class->define_fields = kretprobe_event_define_fields;
1453 } else { 1494 } else {
1454 tp->event.trace = print_kprobe_event; 1495 INIT_LIST_HEAD(&call->class->fields);
1455 call->raw_init = probe_event_raw_init; 1496 call->event.funcs = &kprobe_funcs;
1456 call->show_format = kprobe_event_show_format; 1497 call->class->raw_init = probe_event_raw_init;
1457 call->define_fields = kprobe_event_define_fields; 1498 call->class->define_fields = kprobe_event_define_fields;
1458 } 1499 }
1459 call->event = &tp->event; 1500 if (set_print_fmt(tp) < 0)
1460 call->id = register_ftrace_event(&tp->event); 1501 return -ENOMEM;
1461 if (!call->id) 1502 ret = register_ftrace_event(&call->event);
1503 if (!ret) {
1504 kfree(call->print_fmt);
1462 return -ENODEV; 1505 return -ENODEV;
1463 call->enabled = 0; 1506 }
1464 call->regfunc = probe_event_enable; 1507 call->flags = 0;
1465 call->unregfunc = probe_event_disable; 1508 call->class->reg = kprobe_register;
1466
1467#ifdef CONFIG_EVENT_PROFILE
1468 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable;
1470#endif
1471 call->data = tp; 1509 call->data = tp;
1472 ret = trace_add_event_call(call); 1510 ret = trace_add_event_call(call);
1473 if (ret) { 1511 if (ret) {
1474 pr_info("Failed to register kprobe event: %s\n", call->name); 1512 pr_info("Failed to register kprobe event: %s\n", call->name);
1475 unregister_ftrace_event(&tp->event); 1513 kfree(call->print_fmt);
1514 unregister_ftrace_event(&call->event);
1476 } 1515 }
1477 return ret; 1516 return ret;
1478} 1517}
@@ -1481,6 +1520,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1481{ 1520{
1482 /* tp->event is unregistered in trace_remove_event_call() */ 1521 /* tp->event is unregistered in trace_remove_event_call() */
1483 trace_remove_event_call(&tp->call); 1522 trace_remove_event_call(&tp->call);
1523 kfree(tp->call.print_fmt);
1484} 1524}
1485 1525
1486/* Make a debugfs interface for controling probe points */ 1526/* Make a debugfs interface for controling probe points */
@@ -1523,28 +1563,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1523 1563
1524static __init int kprobe_trace_self_tests_init(void) 1564static __init int kprobe_trace_self_tests_init(void)
1525{ 1565{
1526 int ret; 1566 int ret, warn = 0;
1527 int (*target)(int, int, int, int, int, int); 1567 int (*target)(int, int, int, int, int, int);
1568 struct trace_probe *tp;
1528 1569
1529 target = kprobe_trace_selftest_target; 1570 target = kprobe_trace_selftest_target;
1530 1571
1531 pr_info("Testing kprobe tracing: "); 1572 pr_info("Testing kprobe tracing: ");
1532 1573
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1574 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1575 "$stack $stack0 +0($stack)");
1535 if (WARN_ON_ONCE(ret)) 1576 if (WARN_ON_ONCE(ret)) {
1536 pr_warning("error enabling function entry\n"); 1577 pr_warning("error on probing function entry.\n");
1578 warn++;
1579 } else {
1580 /* Enable trace point */
1581 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1582 if (WARN_ON_ONCE(tp == NULL)) {
1583 pr_warning("error on getting new probe.\n");
1584 warn++;
1585 } else
1586 probe_event_enable(&tp->call);
1587 }
1537 1588
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1589 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval"); 1590 "$retval");
1540 if (WARN_ON_ONCE(ret)) 1591 if (WARN_ON_ONCE(ret)) {
1541 pr_warning("error enabling function return\n"); 1592 pr_warning("error on probing function return.\n");
1593 warn++;
1594 } else {
1595 /* Enable trace point */
1596 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1597 if (WARN_ON_ONCE(tp == NULL)) {
1598 pr_warning("error on getting new probe.\n");
1599 warn++;
1600 } else
1601 probe_event_enable(&tp->call);
1602 }
1603
1604 if (warn)
1605 goto end;
1542 1606
1543 ret = target(1, 2, 3, 4, 5, 6); 1607 ret = target(1, 2, 3, 4, 5, 6);
1544 1608
1545 cleanup_all_probes(); 1609 ret = command_trace_probe("-:testprobe");
1610 if (WARN_ON_ONCE(ret)) {
1611 pr_warning("error on deleting a probe.\n");
1612 warn++;
1613 }
1614
1615 ret = command_trace_probe("-:testprobe2");
1616 if (WARN_ON_ONCE(ret)) {
1617 pr_warning("error on deleting a probe.\n");
1618 warn++;
1619 }
1546 1620
1547 pr_cont("OK\n"); 1621end:
1622 cleanup_all_probes();
1623 if (warn)
1624 pr_cont("NG: Some tests are failed. Please check them.\n");
1625 else
1626 pr_cont("OK\n");
1548 return 0; 1627 return 0;
1549} 1628}
1550 1629
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/slab.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27 28
28#include "trace_output.h" 29#include "trace_output.h"
@@ -33,12 +34,6 @@
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35 36
36/*
37 * For now, let us restrict the no. of symbols traced simultaneously to number
38 * of available hardware breakpoint registers.
39 */
40#define KSYM_TRACER_MAX HBP_NUM
41
42#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
43 38
44struct trace_ksym { 39struct trace_ksym {
@@ -52,7 +47,6 @@ struct trace_ksym {
52 47
53static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
54 49
55static unsigned int ksym_filter_entry_count;
56static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
57 51
58static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -180,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180 struct trace_ksym *entry; 174 struct trace_ksym *entry;
181 int ret = -ENOMEM; 175 int ret = -ENOMEM;
182 176
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
185 " new requests for tracing can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry) 178 if (!entry)
192 return -ENOMEM; 179 return -ENOMEM;
@@ -202,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
202 189
203 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
206 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
207 goto err; 199 goto err;
208 } 200 }
209 201
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212 203
213 return 0; 204 return 0;
214 205
@@ -264,7 +255,6 @@ static void __ksym_trace_reset(void)
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) { 256 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu(); 259 synchronize_rcu();
270 kfree(entry); 260 kfree(entry);
@@ -337,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
337 goto out_unlock; 327 goto out_unlock;
338 } 328 }
339 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
340 ksym_filter_entry_count--;
341 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
342 synchronize_rcu(); 331 synchronize_rcu();
343 kfree(entry); 332 kfree(entry);
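
The ksym tracer stops counting entries against its own KSYM_TRACER_MAX and instead lets the hw-breakpoint layer enforce the limit, decoding -ENOSPC out of the pointer returned by register_wide_hw_breakpoint(). Here is a sketch of the ERR_PTR/IS_ERR/PTR_ERR convention that makes this possible; the helpers are reimplemented locally and register_breakpoint() is a made-up stand-in, not the kernel API.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Error codes folded into the pointer value, like the kernel's ERR_PTR(). */
#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Stand-in for a registration call that fails with -ENOSPC when full. */
static void *register_breakpoint(int slots_left)
{
	if (!slots_left)
		return ERR_PTR(-ENOSPC);
	return malloc(16);	/* any real object */
}

int main(void)
{
	void *bp = register_breakpoint(0);

	if (IS_ERR(bp)) {
		long err = PTR_ERR(bp);

		if (err == -ENOSPC)
			printf("maximum breakpoints reached\n");
		else
			printf("registration failed: %ld\n", err);
		return 1;
	}
	free(bp);
	return 0;
}
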
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..57c1b4596470 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 209
210 return 1; 210 return 1;
211} 211}
212EXPORT_SYMBOL(trace_seq_putc);
212 213
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 214int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 215{
@@ -253,7 +254,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 254 void *ret;
254 255
255 if (s->full) 256 if (s->full)
256 return 0; 257 return NULL;
257 258
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 259 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 260 s->full = 1;
@@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 356}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 357EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 358
359const char *
360ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
361{
362 int i;
363 const char *ret = p->buffer + p->len;
364
365 for (i = 0; i < buf_len; i++)
366 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
367
368 trace_seq_putc(p, 0);
369
370 return ret;
371}
372EXPORT_SYMBOL(ftrace_print_hex_seq);
373
358#ifdef CONFIG_KRETPROBES 374#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 375static inline const char *kretprobed(const char *name)
360{ 376{
@@ -726,6 +742,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 742 if (WARN_ON(!event))
727 goto out; 743 goto out;
728 744
745 if (WARN_ON(!event->funcs))
746 goto out;
747
729 INIT_LIST_HEAD(&event->list); 748 INIT_LIST_HEAD(&event->list);
730 749
731 if (!event->type) { 750 if (!event->type) {
@@ -758,14 +777,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 777 goto out;
759 } 778 }
760 779
761 if (event->trace == NULL) 780 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 781 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 782 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 783 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 784 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 785 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 786 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 787 event->funcs->binary = trace_nop_print;
769 788
770 key = event->type & (EVENT_HASHSIZE - 1); 789 key = event->type & (EVENT_HASHSIZE - 1);
771 790
@@ -807,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 826 * Standard events
808 */ 827 */
809 828
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 829enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
830 struct trace_event *event)
811{ 831{
812 return TRACE_TYPE_HANDLED; 832 return TRACE_TYPE_HANDLED;
813} 833}
814 834
815/* TRACE_FN */ 835/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 836static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
837 struct trace_event *event)
817{ 838{
818 struct ftrace_entry *field; 839 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 840 struct trace_seq *s = &iter->seq;
@@ -840,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 861 return TRACE_TYPE_PARTIAL_LINE;
841} 862}
842 863
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 864static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
865 struct trace_event *event)
844{ 866{
845 struct ftrace_entry *field; 867 struct ftrace_entry *field;
846 868
@@ -854,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 876 return TRACE_TYPE_HANDLED;
855} 877}
856 878
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 879static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
880 struct trace_event *event)
858{ 881{
859 struct ftrace_entry *field; 882 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 883 struct trace_seq *s = &iter->seq;
@@ -867,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 890 return TRACE_TYPE_HANDLED;
868} 891}
869 892
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 893static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
894 struct trace_event *event)
871{ 895{
872 struct ftrace_entry *field; 896 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 897 struct trace_seq *s = &iter->seq;
@@ -880,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 904 return TRACE_TYPE_HANDLED;
881} 905}
882 906
883static struct trace_event trace_fn_event = { 907static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 908 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 909 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 910 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 911 .binary = trace_fn_bin,
889}; 912};
890 913
914static struct trace_event trace_fn_event = {
915 .type = TRACE_FN,
916 .funcs = &trace_fn_funcs,
917};
918
891/* TRACE_CTX an TRACE_WAKE */ 919/* TRACE_CTX an TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 920static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 921 char *delim)
@@ -916,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 944 return TRACE_TYPE_HANDLED;
917} 945}
918 946
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 947static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
948 struct trace_event *event)
920{ 949{
921 return trace_ctxwake_print(iter, "==>"); 950 return trace_ctxwake_print(iter, "==>");
922} 951}
923 952
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 953static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 954 int flags, struct trace_event *event)
926{ 955{
927 return trace_ctxwake_print(iter, " +"); 956 return trace_ctxwake_print(iter, " +");
928} 957}
@@ -950,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 979 return TRACE_TYPE_HANDLED;
951} 980}
952 981
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 982static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
983 struct trace_event *event)
954{ 984{
955 return trace_ctxwake_raw(iter, 0); 985 return trace_ctxwake_raw(iter, 0);
956} 986}
957 987
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 988static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
989 struct trace_event *event)
959{ 990{
960 return trace_ctxwake_raw(iter, '+'); 991 return trace_ctxwake_raw(iter, '+');
961} 992}
@@ -984,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 1015 return TRACE_TYPE_HANDLED;
985} 1016}
986 1017
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1018static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1019 struct trace_event *event)
988{ 1020{
989 return trace_ctxwake_hex(iter, 0); 1021 return trace_ctxwake_hex(iter, 0);
990} 1022}
991 1023
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1024static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1025 struct trace_event *event)
993{ 1026{
994 return trace_ctxwake_hex(iter, '+'); 1027 return trace_ctxwake_hex(iter, '+');
995} 1028}
996 1029
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1030static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1031 int flags, struct trace_event *event)
999{ 1032{
1000 struct ctx_switch_entry *field; 1033 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1034 struct trace_seq *s = &iter->seq;
@@ -1012,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1045 return TRACE_TYPE_HANDLED;
1013} 1046}
1014 1047
1015static struct trace_event trace_ctx_event = { 1048static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1049 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1050 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1051 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1052 .binary = trace_ctxwake_bin,
1021}; 1053};
1022 1054
1023static struct trace_event trace_wake_event = { 1055static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1056 .type = TRACE_CTX,
1057 .funcs = &trace_ctx_funcs,
1058};
1059
1060static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1061 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1062 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1063 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1064 .binary = trace_ctxwake_bin,
1029}; 1065};
1030 1066
1067static struct trace_event trace_wake_event = {
1068 .type = TRACE_WAKE,
1069 .funcs = &trace_wake_funcs,
1070};
1071
1031/* TRACE_SPECIAL */ 1072/* TRACE_SPECIAL */
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1033 int flags) 1074 int flags, struct trace_event *event)
1034{ 1075{
1035 struct special_entry *field; 1076 struct special_entry *field;
1036 1077
@@ -1046,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1046} 1087}
1047 1088
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags) 1090 int flags, struct trace_event *event)
1050{ 1091{
1051 struct special_entry *field; 1092 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq; 1093 struct trace_seq *s = &iter->seq;
@@ -1061,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1061} 1102}
1062 1103
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags) 1105 int flags, struct trace_event *event)
1065{ 1106{
1066 struct special_entry *field; 1107 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq; 1108 struct trace_seq *s = &iter->seq;
@@ -1075,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1075 return TRACE_TYPE_HANDLED; 1116 return TRACE_TYPE_HANDLED;
1076} 1117}
1077 1118
1078static struct trace_event trace_special_event = { 1119static struct trace_event_functions trace_special_funcs = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print, 1120 .trace = trace_special_print,
1081 .raw = trace_special_print, 1121 .raw = trace_special_print,
1082 .hex = trace_special_hex, 1122 .hex = trace_special_hex,
1083 .binary = trace_special_bin, 1123 .binary = trace_special_bin,
1084}; 1124};
1085 1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1086/* TRACE_STACK */ 1131/* TRACE_STACK */
1087 1132
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1133static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1134 int flags, struct trace_event *event)
1090{ 1135{
1091 struct stack_entry *field; 1136 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1137 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1159 return TRACE_TYPE_PARTIAL_LINE;
1115} 1160}
1116 1161
1117static struct trace_event trace_stack_event = { 1162static struct trace_event_functions trace_stack_funcs = {
1118 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1163 .trace = trace_stack_print,
1120 .raw = trace_special_print, 1164 .raw = trace_special_print,
1121 .hex = trace_special_hex, 1165 .hex = trace_special_hex,
1122 .binary = trace_special_bin, 1166 .binary = trace_special_bin,
1123}; 1167};
1124 1168
1169static struct trace_event trace_stack_event = {
1170 .type = TRACE_STACK,
1171 .funcs = &trace_stack_funcs,
1172};
1173
1125/* TRACE_USER_STACK */ 1174/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1175static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1176 int flags, struct trace_event *event)
1128{ 1177{
1129 struct userstack_entry *field; 1178 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1179 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1192 return TRACE_TYPE_PARTIAL_LINE;
1144} 1193}
1145 1194
1146static struct trace_event trace_user_stack_event = { 1195static struct trace_event_functions trace_user_stack_funcs = {
1147 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1196 .trace = trace_user_stack_print,
1149 .raw = trace_special_print, 1197 .raw = trace_special_print,
1150 .hex = trace_special_hex, 1198 .hex = trace_special_hex,
1151 .binary = trace_special_bin, 1199 .binary = trace_special_bin,
1152}; 1200};
1153 1201
1202static struct trace_event trace_user_stack_event = {
1203 .type = TRACE_USER_STACK,
1204 .funcs = &trace_user_stack_funcs,
1205};
1206
1154/* TRACE_BPRINT */ 1207/* TRACE_BPRINT */
1155static enum print_line_t 1208static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1209trace_bprint_print(struct trace_iterator *iter, int flags,
1210 struct trace_event *event)
1157{ 1211{
1158 struct trace_entry *entry = iter->ent; 1212 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1213 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1232
1179 1233
1180static enum print_line_t 1234static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1235trace_bprint_raw(struct trace_iterator *iter, int flags,
1236 struct trace_event *event)
1182{ 1237{
1183 struct bprint_entry *field; 1238 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1239 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1252 return TRACE_TYPE_PARTIAL_LINE;
1198} 1253}
1199 1254
1255static struct trace_event_functions trace_bprint_funcs = {
1256 .trace = trace_bprint_print,
1257 .raw = trace_bprint_raw,
1258};
1200 1259
1201static struct trace_event trace_bprint_event = { 1260static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1261 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1262 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1263};
1206 1264
1207/* TRACE_PRINT */ 1265/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1266static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1267 int flags, struct trace_event *event)
1210{ 1268{
1211 struct print_entry *field; 1269 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1270 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1226} 1284}
1227 1285
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1286static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1287 struct trace_event *event)
1229{ 1288{
1230 struct print_entry *field; 1289 struct print_entry *field;
1231 1290
@@ -1240,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1299 return TRACE_TYPE_PARTIAL_LINE;
1241} 1300}
1242 1301
1243static struct trace_event trace_print_event = { 1302static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1303 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1304 .raw = trace_print_raw,
1247}; 1305};
1248 1306
1307static struct trace_event trace_print_event = {
1308 .type = TRACE_PRINT,
1309 .funcs = &trace_print_funcs,
1310};
1311
1249 1312
1250static struct trace_event *events[] __initdata = { 1313static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1314 &trace_fn_event,
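
The hunks above all apply one conversion: the per-event output callbacks move out of struct trace_event into a shared struct trace_event_functions, and struct trace_event keeps only the type id plus a pointer to that table (the callbacks themselves also gain a struct trace_event * argument). A minimal sketch of the resulting shape, written as if it sat next to the code in this diff and using the types and the TRACE_STACK id that appear here; the my_* names are illustrative:

/* Sketch: wiring an event's output callbacks after the funcs split. */
static enum print_line_t my_stack_print(struct trace_iterator *iter, int flags,
                                        struct trace_event *event)
{
        /* the extra 'event' argument lets one callback serve several events */
        return TRACE_TYPE_HANDLED;
}

static struct trace_event_functions my_stack_funcs = {
        .trace  = my_stack_print,
        /* .raw, .hex and .binary can be filled in the same way when needed */
};

static struct trace_event my_stack_event = {
        .type   = TRACE_STACK,          /* event type id, as before */
        .funcs  = &my_stack_funcs,      /* shared callback table */
};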
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
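
The mechanical change in this file — every probe gains a leading void * parameter and every register/unregister call passes an extra argument — follows from tracepoints now carrying per-probe private data (see the kernel/tracepoint.c hunks later in this diff). A hedged sketch of a caller after the conversion; the probe body and the my_* names are illustrative, while the NULL data pointer mirrors what this file does:

/* Sketch: attaching to sched_switch under the new (probe, data) convention. */
#include <linux/kernel.h>
#include <trace/events/sched.h>

static void
my_probe_sched_switch(void *data, struct task_struct *prev,
                      struct task_struct *next)
{
        /* 'data' is whatever was handed to register_trace_sched_switch() */
}

static int my_attach(void)
{
        int ret;

        ret = register_trace_sched_switch(my_probe_sched_switch, NULL);
        if (ret)
                pr_info("could not attach to the sched_switch tracepoint\n");
        return ret;
}

static void my_detach(void)
{
        /* unregistration must pass the same (probe, data) pair */
        unregister_trace_sched_switch(my_probe_sched_switch, NULL);
}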
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 111probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 112 struct task_struct *prev, struct task_struct *next)
112{ 113{
113 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 201}
201 202
202static void 203static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 205{
205 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 265{
265 int ret; 266 int ret;
266 267
267 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 269 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
271 return; 272 return;
272 } 273 }
273 274
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 276 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 279 goto fail_deprobe;
279 } 280 }
280 281
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 283 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
286 } 287 }
287 288
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 290 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 313
313 return; 314 return;
314fail_deprobe_wake_new: 315fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 317fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 319}
319 320
320static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
321{ 322{
322 tracer_enabled = 0; 323 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 329}
329 330
330static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
@@ -16,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 17 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM: 20 case TRACE_KSYM:
21 return 1; 21 return 1;
22 } 22 }
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
29 struct trace_entry *entry; 29 struct trace_entry *entry;
30 unsigned int loops = 0; 30 unsigned int loops = 0;
31 31
32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
33 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
34 34
35 /* 35 /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
255/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
256#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
257 257
258static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
259static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
260 261
261/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
266 ftrace_graph_stop(); 267 ftrace_graph_stop();
267 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
268 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
269 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
270 return 0; 271 return 0;
271 } 272 }
272 273
@@ -754,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
754} 755}
755#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
756 757
757#ifdef CONFIG_HW_BRANCH_TRACER
758int
759trace_selftest_startup_hw_branches(struct tracer *trace,
760 struct trace_array *tr)
761{
762 struct trace_iterator *iter;
763 struct tracer tracer;
764 unsigned long count;
765 int ret;
766
767 if (!trace->open) {
768 printk(KERN_CONT "missing open function...");
769 return -1;
770 }
771
772 ret = tracer_init(trace, tr);
773 if (ret) {
774 warn_failed_init_tracer(trace, ret);
775 return ret;
776 }
777
778 /*
779 * The hw-branch tracer needs to collect the trace from the various
780 * cpu trace buffers - before tracing is stopped.
781 */
782 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
783 if (!iter)
784 return -ENOMEM;
785
786 memcpy(&tracer, trace, sizeof(tracer));
787
788 iter->trace = &tracer;
789 iter->tr = tr;
790 iter->pos = -1;
791 mutex_init(&iter->mutex);
792
793 trace->open(iter);
794
795 mutex_destroy(&iter->mutex);
796 kfree(iter);
797
798 tracing_stop();
799
800 ret = trace_test_buffer(tr, &count);
801 trace->reset(tr);
802 tracing_start();
803
804 if (!ret && !count) {
805 printk(KERN_CONT "no entries found..");
806 ret = -1;
807 }
808
809 return ret;
810}
811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
815 760
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
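
All three trace_stack.c hunks add the same guard: the per-cpu trace_active counter is bumped before max_stack_lock is taken, so a stack-trace callback that fires inside arch_spin_lock() (or from an NMI) sees the counter non-zero and returns instead of trying to take the lock again. Condensed into one illustrative helper (trace_active and max_stack_lock are the symbols this file uses; the function itself is a sketch):

/* Sketch: keeping the stack tracer out while we hold its own lock. */
static void set_max_under_lock(unsigned long *ptr, unsigned long val)
{
        unsigned long flags;
        int cpu;

        local_irq_save(flags);

        /*
         * Mark this CPU as already inside the stack tracer so that a
         * nested callback (from arch_spin_lock() or an NMI) bails out
         * early instead of deadlocking on max_stack_lock.
         */
        cpu = smp_processor_id();
        per_cpu(trace_active, cpu)++;

        arch_spin_lock(&max_stack_lock);
        *ptr = val;
        arch_spin_unlock(&max_stack_lock);

        per_cpu(trace_active, cpu)--;
        local_irq_restore(flags);
}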
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..34e35804304b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -14,6 +15,54 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
17extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
19 68
@@ -52,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
52} 101}
53 102
54enum print_line_t 103enum print_line_t
55print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
56{ 106{
57 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
58 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -67,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
67 if (!entry) 117 if (!entry)
68 goto end; 118 goto end;
69 119
70 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
71 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
72 goto end; 122 goto end;
73 } 123 }
@@ -104,7 +154,8 @@ end:
104} 154}
105 155
106enum print_line_t 156enum print_line_t
107print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
108{ 159{
109 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
110 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -122,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
122 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
123 } 174 }
124 175
125 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
126 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
127 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
128 } 179 }
@@ -143,73 +194,68 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 194 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 195 sizeof(trace.name), is_signed_type(type)
145 196
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 197static
198int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 199{
148 int i; 200 int i;
149 int ret; 201 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 202
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 203 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 204#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 205
206 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 207 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 208 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 209 entry->args[i], sizeof(unsigned long),
163 if (!ret) 210 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 211 }
212 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 213
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 214 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 215 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 216 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 217 }
182 trace_seq_putc(s, '"');
183 218
184 for (i = 0; i < entry->nb_args; i++) { 219#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 220
191 return trace_seq_putc(s, '\n'); 221 /* return the length of print_fmt */
222 return pos;
192} 223}
193 224
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 225static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 226{
196 int ret; 227 char *print_fmt;
197 struct syscall_trace_exit trace; 228 int len;
229 struct syscall_metadata *entry = call->data;
198 230
199 ret = trace_seq_printf(s, 231 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 232 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 233 return 0;
234 }
235
236 /* First: called with 0 length to calculate the needed length */
237 len = __set_enter_print_fmt(entry, NULL, 0);
238
239 print_fmt = kmalloc(len + 1, GFP_KERNEL);
240 if (!print_fmt)
241 return -ENOMEM;
208 242
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 243 /* Second: actually write the @print_fmt */
244 __set_enter_print_fmt(entry, print_fmt, len + 1);
245 call->print_fmt = print_fmt;
246
247 return 0;
248}
249
250static void free_syscall_print_fmt(struct ftrace_event_call *call)
251{
252 struct syscall_metadata *entry = call->data;
253
254 if (entry->enter_event == call)
255 kfree(call->print_fmt);
210} 256}
211 257
212int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
213{ 259{
214 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
215 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
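
The replacement of syscall_enter_format() by __set_enter_print_fmt()/set_syscall_print_fmt() in the hunk above is the standard two-pass snprintf idiom: run the formatter once with a zero length to measure, allocate exactly that much, then run it again to fill the buffer; the LEN_OR_ZERO macro is what turns the first pass into a pure measurement. The same idiom in a self-contained user-space sketch (the names and example arguments are illustrative):

/* Sketch of the measure-then-fill snprintf pattern used above. */
#include <stdio.h>
#include <stdlib.h>

static int build_fmt(const char **args, int nargs, char *buf, int len)
{
        int i, pos = 0;

/* On the measuring pass buf is NULL and len is 0. */
#define BUF_OR_NULL (len ? buf + pos : NULL)
#define LEN_OR_ZERO (len ? len - pos : 0)
        pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "\"");
        for (i = 0; i < nargs; i++)
                pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "%s: 0x%%08lx%s",
                                args[i], i == nargs - 1 ? "" : ", ");
        pos += snprintf(BUF_OR_NULL, LEN_OR_ZERO, "\"");
#undef BUF_OR_NULL
#undef LEN_OR_ZERO

        return pos;                     /* length needed, without the NUL */
}

int main(void)
{
        const char *args[] = { "fd", "buf", "count" };
        int len = build_fmt(args, 3, NULL, 0);  /* pass 1: measure */
        char *fmt = malloc(len + 1);

        if (!fmt)
                return 1;
        build_fmt(args, 3, fmt, len + 1);       /* pass 2: fill */
        printf("%s\n", fmt);  /* "fd: 0x%08lx, buf: 0x%08lx, count: 0x%08lx" */
        free(fmt);
        return 0;
}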
@@ -232,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
232 return ret; 278 return ret;
233} 279}
234 280
235int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
236{ 282{
237 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
238 int ret; 284 int ret;
@@ -247,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
247 return ret; 293 return ret;
248} 294}
249 295
250void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
251{ 297{
252 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
253 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -269,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
269 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
270 316
271 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
272 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
273 if (!event) 319 if (!event)
274 return; 320 return;
275 321
@@ -282,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
282 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
283} 329}
284 330
285void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
286{ 332{
287 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
288 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -301,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
301 return; 347 return;
302 348
303 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
304 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
305 if (!event) 351 if (!event)
306 return; 352 return;
307 353
@@ -324,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
324 return -ENOSYS; 370 return -ENOSYS;
325 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
326 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
327 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
328 if (!ret) { 374 if (!ret) {
329 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
330 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -344,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
344 sys_refcount_enter--; 390 sys_refcount_enter--;
345 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
346 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
347 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
348 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
349} 395}
350 396
@@ -358,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
358 return -ENOSYS; 404 return -ENOSYS;
359 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
360 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
361 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
362 if (!ret) { 408 if (!ret) {
363 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
364 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -378,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
378 sys_refcount_exit--; 424 sys_refcount_exit--;
379 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
380 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
381 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
382 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
383} 429}
384 430
@@ -386,12 +432,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
386{ 432{
387 int id; 433 int id;
388 434
389 id = register_ftrace_event(call->event); 435 if (set_syscall_print_fmt(call) < 0)
390 if (!id) 436 return -ENOMEM;
391 return -ENODEV; 437
392 call->id = id; 438 id = trace_event_raw_init(call);
393 INIT_LIST_HEAD(&call->fields); 439
394 return 0; 440 if (id < 0) {
441 free_syscall_print_fmt(call);
442 return id;
443 }
444
445 return id;
446}
447
448unsigned long __init arch_syscall_addr(int nr)
449{
450 return (unsigned long)sys_call_table[nr];
395} 451}
396 452
397int __init init_ftrace_syscalls(void) 453int __init init_ftrace_syscalls(void)
@@ -421,27 +477,24 @@ int __init init_ftrace_syscalls(void)
421} 477}
422core_initcall(init_ftrace_syscalls); 478core_initcall(init_ftrace_syscalls);
423 479
424#ifdef CONFIG_EVENT_PROFILE 480#ifdef CONFIG_PERF_EVENTS
425 481
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 482static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 483static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
428static int sys_prof_refcount_enter; 484static int sys_perf_refcount_enter;
429static int sys_prof_refcount_exit; 485static int sys_perf_refcount_exit;
430 486
431static void prof_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
432{ 488{
433 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
435 unsigned long flags; 491 struct hlist_head *head;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 492 int syscall_nr;
439 int rctx; 493 int rctx;
440 int size; 494 int size;
441 int cpu;
442 495
443 syscall_nr = syscall_get_nr(current, regs); 496 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 497 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
445 return; 498 return;
446 499
447 sys_data = syscall_nr_to_meta(syscall_nr); 500 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -453,44 +506,24 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
453 size = ALIGN(size + sizeof(u32), sizeof(u64)); 506 size = ALIGN(size + sizeof(u32), sizeof(u64));
454 size -= sizeof(u32); 507 size -= sizeof(u32);
455 508
456 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 509 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
457 "profile buffer not large enough")) 510 "perf buffer not large enough"))
458 return; 511 return;
459 512
460 /* Protect the per cpu buffer, begin the rcu read side */ 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
461 local_irq_save(flags); 514 sys_data->enter_event->event.type, regs, &rctx);
462 515 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 516 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 517
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 518 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
486 521
487end: 522 head = this_cpu_ptr(sys_data->enter_event->perf_events);
488 perf_swevent_put_recursion_context(rctx); 523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
489end_recursion:
490 local_irq_restore(flags);
491} 524}
492 525
493int prof_sysenter_enable(struct ftrace_event_call *call) 526int perf_sysenter_enable(struct ftrace_event_call *call)
494{ 527{
495 int ret = 0; 528 int ret = 0;
496 int num; 529 int num;
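
The perf_syscall_enter()/perf_syscall_exit() rewrites in the hunk above drop the hand-rolled recursion-context and per-cpu buffer handling in favour of the perf_trace_buf_prepare()/perf_trace_buf_submit() pair, with the destination hlist of perf events now hanging off the event itself. A compressed sketch of the resulting probe shape, using only the call signatures visible in this diff; the record layout, my_event_type and my_event_call are purely illustrative placeholders:

/* Sketch: a perf tracepoint probe built on the prepare/submit helpers. */
static void my_perf_probe(void *ignore, struct pt_regs *regs, long id)
{
        struct my_trace_record *rec;    /* starts with struct trace_entry */
        struct hlist_head *head;
        int rctx;
        int size;

        /* perf prepends a u32 header; keep the total u64-aligned */
        size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)) - sizeof(u32);

        rec = (struct my_trace_record *)perf_trace_buf_prepare(size,
                        my_event_type, regs, &rctx);
        if (!rec)
                return;

        /* fill in the payload fields of 'rec' here */

        head = this_cpu_ptr(my_event_call->perf_events);
        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
}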
@@ -498,47 +531,44 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
498 num = ((struct syscall_metadata *)call->data)->syscall_nr; 531 num = ((struct syscall_metadata *)call->data)->syscall_nr;
499 532
500 mutex_lock(&syscall_trace_lock); 533 mutex_lock(&syscall_trace_lock);
501 if (!sys_prof_refcount_enter) 534 if (!sys_perf_refcount_enter)
502 ret = register_trace_sys_enter(prof_syscall_enter); 535 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
503 if (ret) { 536 if (ret) {
504 pr_info("event trace: Could not activate" 537 pr_info("event trace: Could not activate"
505 "syscall entry trace point"); 538 "syscall entry trace point");
506 } else { 539 } else {
507 set_bit(num, enabled_prof_enter_syscalls); 540 set_bit(num, enabled_perf_enter_syscalls);
508 sys_prof_refcount_enter++; 541 sys_perf_refcount_enter++;
509 } 542 }
510 mutex_unlock(&syscall_trace_lock); 543 mutex_unlock(&syscall_trace_lock);
511 return ret; 544 return ret;
512} 545}
513 546
514void prof_sysenter_disable(struct ftrace_event_call *call) 547void perf_sysenter_disable(struct ftrace_event_call *call)
515{ 548{
516 int num; 549 int num;
517 550
518 num = ((struct syscall_metadata *)call->data)->syscall_nr; 551 num = ((struct syscall_metadata *)call->data)->syscall_nr;
519 552
520 mutex_lock(&syscall_trace_lock); 553 mutex_lock(&syscall_trace_lock);
521 sys_prof_refcount_enter--; 554 sys_perf_refcount_enter--;
522 clear_bit(num, enabled_prof_enter_syscalls); 555 clear_bit(num, enabled_perf_enter_syscalls);
523 if (!sys_prof_refcount_enter) 556 if (!sys_perf_refcount_enter)
524 unregister_trace_sys_enter(prof_syscall_enter); 557 unregister_trace_sys_enter(perf_syscall_enter, NULL);
525 mutex_unlock(&syscall_trace_lock); 558 mutex_unlock(&syscall_trace_lock);
526} 559}
527 560
528static void prof_syscall_exit(struct pt_regs *regs, long ret) 561static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
529{ 562{
530 struct syscall_metadata *sys_data; 563 struct syscall_metadata *sys_data;
531 struct syscall_trace_exit *rec; 564 struct syscall_trace_exit *rec;
532 unsigned long flags; 565 struct hlist_head *head;
533 int syscall_nr; 566 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 567 int rctx;
537 int size; 568 int size;
538 int cpu;
539 569
540 syscall_nr = syscall_get_nr(current, regs); 570 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 571 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
542 return; 572 return;
543 573
544 sys_data = syscall_nr_to_meta(syscall_nr); 574 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -553,45 +583,23 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
553 * Impossible, but be paranoid with the future 583 * Impossible, but be paranoid with the future
554 * How to put this check outside runtime? 584 * How to put this check outside runtime?
555 */ 585 */
556 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 586 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
557 "exit event has grown above profile buffer size")) 587 "exit event has grown above perf buffer size"))
558 return; 588 return;
559 589
560 /* Protect the per cpu buffer, begin the rcu read side */ 590 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
561 local_irq_save(flags); 591 sys_data->exit_event->event.type, regs, &rctx);
562 592 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 593 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 594
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
585 597
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 598 head = this_cpu_ptr(sys_data->exit_event->perf_events);
587 599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 600}
593 601
594int prof_sysexit_enable(struct ftrace_event_call *call) 602int perf_sysexit_enable(struct ftrace_event_call *call)
595{ 603{
596 int ret = 0; 604 int ret = 0;
597 int num; 605 int num;
@@ -599,33 +607,73 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
599 num = ((struct syscall_metadata *)call->data)->syscall_nr; 607 num = ((struct syscall_metadata *)call->data)->syscall_nr;
600 608
601 mutex_lock(&syscall_trace_lock); 609 mutex_lock(&syscall_trace_lock);
602 if (!sys_prof_refcount_exit) 610 if (!sys_perf_refcount_exit)
603 ret = register_trace_sys_exit(prof_syscall_exit); 611 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
604 if (ret) { 612 if (ret) {
605 pr_info("event trace: Could not activate" 613 pr_info("event trace: Could not activate"
606 "syscall entry trace point"); 614 "syscall exit trace point");
607 } else { 615 } else {
608 set_bit(num, enabled_prof_exit_syscalls); 616 set_bit(num, enabled_perf_exit_syscalls);
609 sys_prof_refcount_exit++; 617 sys_perf_refcount_exit++;
610 } 618 }
611 mutex_unlock(&syscall_trace_lock); 619 mutex_unlock(&syscall_trace_lock);
612 return ret; 620 return ret;
613} 621}
614 622
615void prof_sysexit_disable(struct ftrace_event_call *call) 623void perf_sysexit_disable(struct ftrace_event_call *call)
616{ 624{
617 int num; 625 int num;
618 626
619 num = ((struct syscall_metadata *)call->data)->syscall_nr; 627 num = ((struct syscall_metadata *)call->data)->syscall_nr;
620 628
621 mutex_lock(&syscall_trace_lock); 629 mutex_lock(&syscall_trace_lock);
622 sys_prof_refcount_exit--; 630 sys_perf_refcount_exit--;
623 clear_bit(num, enabled_prof_exit_syscalls); 631 clear_bit(num, enabled_perf_exit_syscalls);
624 if (!sys_prof_refcount_exit) 632 if (!sys_perf_refcount_exit)
625 unregister_trace_sys_exit(prof_syscall_exit); 633 unregister_trace_sys_exit(perf_syscall_exit, NULL);
626 mutex_unlock(&syscall_trace_lock); 634 mutex_unlock(&syscall_trace_lock);
627} 635}
628 636
637#endif /* CONFIG_PERF_EVENTS */
638
639static int syscall_enter_register(struct ftrace_event_call *event,
640 enum trace_reg type)
641{
642 switch (type) {
643 case TRACE_REG_REGISTER:
644 return reg_event_syscall_enter(event);
645 case TRACE_REG_UNREGISTER:
646 unreg_event_syscall_enter(event);
647 return 0;
648
649#ifdef CONFIG_PERF_EVENTS
650 case TRACE_REG_PERF_REGISTER:
651 return perf_sysenter_enable(event);
652 case TRACE_REG_PERF_UNREGISTER:
653 perf_sysenter_disable(event);
654 return 0;
629#endif 655#endif
656 }
657 return 0;
658}
630 659
660static int syscall_exit_register(struct ftrace_event_call *event,
661 enum trace_reg type)
662{
663 switch (type) {
664 case TRACE_REG_REGISTER:
665 return reg_event_syscall_exit(event);
666 case TRACE_REG_UNREGISTER:
667 unreg_event_syscall_exit(event);
668 return 0;
631 669
670#ifdef CONFIG_PERF_EVENTS
671 case TRACE_REG_PERF_REGISTER:
672 return perf_sysexit_enable(event);
673 case TRACE_REG_PERF_UNREGISTER:
674 perf_sysexit_disable(event);
675 return 0;
676#endif
677 }
678 return 0;
679}
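
The block appended to trace_syscalls.c is the other half of the event-layer rework seen throughout this diff: each event class now supplies a single ->reg() callback that switches on enum trace_reg, so ftrace and perf registration go through one entry point instead of separate profile hooks. A compressed sketch of that dispatch shape (ftrace_event_call, ftrace_event_class and the TRACE_REG_* values are the ones used here; the my_* helpers are hypothetical):

/* Sketch: one ->reg() callback multiplexing tracing and perf registration. */
static int my_event_register(struct ftrace_event_call *event,
                             enum trace_reg type)
{
        switch (type) {
        case TRACE_REG_REGISTER:
                return my_tracing_enable(event);
        case TRACE_REG_UNREGISTER:
                my_tracing_disable(event);
                return 0;
#ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
                return my_perf_enable(event);
        case TRACE_REG_PERF_UNREGISTER:
                my_perf_disable(event);
                return 0;
#endif
        }
        return 0;
}

static struct ftrace_event_class my_event_class = {
        .system = "my_subsys",          /* directory under events/ */
        .reg    = my_event_register,    /* single registration hook */
};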
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
@@ -48,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
48 49
49/* Insertion of a work */ 50/* Insertion of a work */
50static void 51static void
51probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
52 struct work_struct *work) 54 struct work_struct *work)
53{ 55{
54 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -69,7 +71,8 @@ found:
69 71
70/* Execution of a work */ 72/* Execution of a work */
71static void 73static void
72probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
73 struct work_struct *work) 76 struct work_struct *work)
74{ 77{
75 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -89,7 +92,8 @@ found:
89} 92}
90 93
91/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
92static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
93{ 97{
94 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
95 unsigned long flags; 99 unsigned long flags;
@@ -113,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
113} 117}
114 118
115/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
116static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
117{ 122{
118 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
119 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -258,19 +263,19 @@ int __init trace_workqueue_early_init(void)
258{ 263{
259 int ret, cpu; 264 int ret, cpu;
260 265
261 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
262 if (ret) 267 if (ret)
263 goto out; 268 goto out;
264 269
265 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
266 if (ret) 271 if (ret)
267 goto no_insertion; 272 goto no_insertion;
268 273
269 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
270 if (ret) 275 if (ret)
271 goto no_execution; 276 goto no_execution;
272 277
273 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
274 if (ret) 279 if (ret)
275 goto no_creation; 280 goto no_creation;
276 281
@@ -282,11 +287,11 @@ int __init trace_workqueue_early_init(void)
282 return 0; 287 return 0;
283 288
284no_creation: 289no_creation:
285 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
286no_execution: 291no_execution:
287 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
288no_insertion: 293no_insertion:
289 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
290out: 295out:
291 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
292 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which insures that every preempt disabled section 391 * itself uses stop_machine(), which insures that every preempt disabled section
383 * have finished. 392 * have finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
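
Underneath all of the probe-signature changes in this diff, kernel/tracepoint.c swaps its bare void ** probe arrays for arrays of struct tracepoint_func, pairing each callback with its private data; the arrays stay NULL-terminated (by .func) and probes are matched on the (func, data) pair rather than the function pointer alone. A small sketch of how such an array is walked, mirroring the loops above (the struct is repeated here only so the sketch stands alone; find_probe is illustrative):

/* Sketch: scanning a NULL-terminated array of tracepoint_func entries. */
struct tracepoint_func {
        void *func;     /* probe callback */
        void *data;     /* private data handed back to the probe */
};

/* Return the index of the (probe, data) pair, or -1 if it is not registered. */
static int find_probe(const struct tracepoint_func *funcs,
                      void *probe, void *data)
{
        int i;

        for (i = 0; funcs[i].func; i++)         /* .func == NULL terminates */
                if (funcs[i].func == probe && funcs[i].data == data)
                        return i;
        return -1;
}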
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 00d59d048edf..0a67e041edf8 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -21,6 +21,7 @@
21#include <linux/tsacct_kern.h> 21#include <linux/tsacct_kern.h>
22#include <linux/acct.h> 22#include <linux/acct.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * fill in basic accounting fields 27 * fill in basic accounting fields
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -56,9 +55,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 55 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 56 .locked_shm = 0,
58 .user_ns = &init_user_ns, 57 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 58};
63 59
64/* 60/*
@@ -75,268 +71,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 71 put_user_ns(up->user_ns);
76} 72}
77 73
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 74static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 75{
342 struct user_struct *user; 76 struct user_struct *user;
@@ -352,11 +86,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 86 return NULL;
353} 87}
354 88
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 89/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 90 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 91 * upon function exit.
@@ -365,32 +94,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 94{
366 uid_hash_remove(up); 95 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 96 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 97 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 98 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 99 kmem_cache_free(uid_cachep, up);
372} 100}
373 101
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 102/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 103 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 104 * caller must undo that ref with free_uid().
@@ -428,11 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
428 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
429 struct user_struct *up, *new; 137 struct user_struct *up, *new;
430 138
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic.
433 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +148,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 148 new->uid = uid;
446 atomic_set(&new->__count, 1); 149 atomic_set(&new->__count, 1);
447 150
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 151 new->user_ns = get_user_ns(ns);
452 152
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 153 /*
457 * Before adding this, check whether we raced 154 * Before adding this, check whether we raced
458 * on adding the same user already.. 155 * on adding the same user already..
@@ -460,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
460 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
461 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
462 if (up) { 159 if (up) {
463 /* This case is not possible when CONFIG_USER_SCHED
464 * is defined, since we serialize alloc_uid() using
465 * uids_mutex. Hence no need to call
466 * sched_destroy_user() or remove_user_sysfs_dir().
467 */
468 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
469 key_put(new->session_keyring); 161 key_put(new->session_keyring);
470 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -475,17 +167,9 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 167 spin_unlock_irq(&uidhash_lock);
476 } 168 }
477 169
478 uids_mutex_unlock();
479
480 return up; 170 return up;
481 171
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new);
487out_unlock: 172out_unlock:
488 uids_mutex_unlock();
489 return NULL; 173 return NULL;
490} 174}
491 175
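
With CONFIG_USER_SCHED gone, user.c above no longer creates a per-UID task group or a /sys/kernel/uids/<uid> directory, so the delayed-work teardown, the uids_mutex serialization and the sysfs attributes all disappear and free_user() tears the user_struct down synchronously. For context, the caller side is unchanged: the final reference drop still funnels through free_uid(), roughly as below (recalled from the same file of this era, not part of this diff):

void free_uid(struct user_struct *up)
{
	unsigned long flags;

	if (!up)
		return;

	local_irq_save(flags);
	/* take uidhash_lock only if this was the last reference */
	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
		free_user(up, flags);		/* drops the lock and frees up */
	else
		local_irq_restore(flags);
}
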
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..b2d70d38dff4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -54,8 +54,8 @@ int create_user_ns(struct cred *new)
54#endif 54#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 56
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */
58 kref_set(&ns->kref, 1);
 57 /* root_user holds a reference to ns, our reference can be dropped */
 58 put_user_ns(ns);
59 59
60 return 0; 60 return 0;
61} 61}
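
The create_user_ns() change replaces a forced kref_set() with an ordinary reference drop: alloc_uid() already took its own reference on the new namespace via get_user_ns(), so instead of resetting the count to 1 the creator simply releases its reference and lets root_user keep the namespace alive. A hedged sketch of the resulting ownership flow (the counts in comments describe the intended invariant, they are not quoted from the patch):

	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
	kref_init(&ns->kref);			/* count == 1, held by the creator      */

	root_user = alloc_uid(ns, 0);		/* get_user_ns(ns): count == 2          */
	if (!root_user) {
		kfree(ns);
		return -ENOMEM;
	}
	/* ... install root_user in the new creds ... */
	put_user_ns(ns);			/* drop the creator's ref: count == 1,
						 * now owned by root_user               */
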
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..327d2deb4451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work);
 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
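
The new clear_wq_data() helper differs from work_clear_pending() in that it also forgets which cpu_workqueue_struct the work item was last queued on, while keeping the WORK_STRUCT_STATIC debug-objects bit intact. The layout assumption behind it, restated here as a sketch rather than quoted from the patch:

/*
 * work->data packs flags and a pointer into one word:
 *   bit WORK_STRUCT_PENDING - the item is queued or being cancelled
 *   bit WORK_STRUCT_STATIC  - statically initialized work item (only
 *                             meaningful with CONFIG_DEBUG_OBJECTS_WORK)
 *   remaining bits          - cpu_workqueue_struct pointer stored by
 *                             set_wq_data() / read by get_wq_data()
 *
 * After a successful __cancel_work_timer(), clear_wq_data() leaves only the
 * STATIC bit set, so debugobjects still recognizes the work item while no
 * stale cwq pointer is left behind for later get_wq_data() users such as
 * the flush_delayed_work() path below.
 */
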
@@ -774,7 +784,7 @@ void flush_delayed_work(struct delayed_work *dwork)
774{ 784{
775 if (del_timer_sync(&dwork->timer)) { 785 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 786 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu());
 787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
778 __queue_work(cwq, &dwork->work); 788 __queue_work(cwq, &dwork->work);
779 put_cpu(); 789 put_cpu();
780 } 790 }
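
The flush_delayed_work() hunk fixes a wrong assumption: the old code always re-queued a timer-pending item on keventd_wq, which is only correct for work scheduled with schedule_delayed_work(). The fixed code recovers the workqueue the item was actually queued on from work->data. A hedged example of the case the old code mishandled; my_wq, my_dwork and my_work_fn are illustrative names:

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work);	/* ordinary work handler */
static DECLARE_DELAYED_WORK(my_dwork, my_work_fn);
static struct workqueue_struct *my_wq;			/* private wq, not keventd_wq */

static void my_example(void)
{
	my_wq = create_workqueue("my_wq");
	queue_delayed_work(my_wq, &my_dwork, HZ);
	/* ... */
	flush_delayed_work(&my_dwork);	/* must run my_dwork on my_wq; the old code
					 * would have re-queued it on keventd_wq */
}
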
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);
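
The new kerneldoc above spells out when flush_scheduled_work() deadlocks. A minimal sketch of the first scenario it describes (my_mutex, my_work and my_work_fn are illustrative, not from the patch); cancel_work_sync() or cancel_delayed_work_sync() avoid the problem because they only wait for one specific item:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(my_mutex);

static void my_work_fn(struct work_struct *work)
{
	mutex_lock(&my_mutex);		/* waits for the flusher to drop the lock... */
	/* ... */
	mutex_unlock(&my_mutex);
}
static DECLARE_WORK(my_work, my_work_fn);

static void my_teardown(void)		/* my_work was queued earlier via schedule_work() */
{
	mutex_lock(&my_mutex);
	flush_scheduled_work();		/* ...which never happens: deadlock */
	mutex_unlock(&my_mutex);
}
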
@@ -1076,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1076 unsigned int cpu = (unsigned long)hcpu; 1110 unsigned int cpu = (unsigned long)hcpu;
1077 struct cpu_workqueue_struct *cwq; 1111 struct cpu_workqueue_struct *cwq;
1078 struct workqueue_struct *wq; 1112 struct workqueue_struct *wq;
1079 int ret = NOTIFY_OK;
 1113 int err = 0;
1080 1114
1081 action &= ~CPU_TASKS_FROZEN; 1115 action &= ~CPU_TASKS_FROZEN;
1082 1116
@@ -1090,12 +1124,13 @@ undo:
1090 1124
1091 switch (action) { 1125 switch (action) {
1092 case CPU_UP_PREPARE: 1126 case CPU_UP_PREPARE:
1093 if (!create_workqueue_thread(cwq, cpu))
 1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1094 break; 1129 break;
1095 printk(KERN_ERR "workqueue [%s] for %i failed\n", 1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1096 wq->name, cpu); 1131 wq->name, cpu);
1097 action = CPU_UP_CANCELED; 1132 action = CPU_UP_CANCELED;
1098 ret = NOTIFY_BAD;
 1133 err = -ENOMEM;
1099 goto undo; 1134 goto undo;
1100 1135
1101 case CPU_ONLINE: 1136 case CPU_ONLINE:
@@ -1116,7 +1151,7 @@ undo:
1116 cpumask_clear_cpu(cpu, cpu_populated_map); 1151 cpumask_clear_cpu(cpu, cpu_populated_map);
1117 } 1152 }
1118 1153
1119 return ret;
 1154 return notifier_from_errno(err);
1120} 1155}
1121 1156
1122#ifdef CONFIG_SMP 1157#ifdef CONFIG_SMP
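
Finally, workqueue_cpu_callback() now tracks a plain -errno and converts it with notifier_from_errno() instead of returning NOTIFY_BAD directly, so callers of the notifier chain can recover the original error code. A hedged sketch of the caller side, with the surrounding CPU-hotplug plumbing simplified and my_chain used as an illustrative chain name:

	int ret, err;

	ret = raw_notifier_call_chain(&my_chain, CPU_UP_PREPARE, hcpu);
	err = notifier_to_errno(ret);	/* 0 for NOTIFY_OK, else the encoded -errno */
	if (err)
		goto rollback;		/* e.g. -ENOMEM from the hunk above */
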